/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.commons.compress.archivers.dump;

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.Stack;

import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipEncoding;
import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
import org.apache.commons.compress.utils.IOUtils;

/**
 * The DumpArchiveInputStream reads a UNIX dump archive as an InputStream.
 * Methods are provided to position at each successive entry in
 * the archive, and then read each entry as a normal input stream
 * using read().
 *
 * There does not seem to be any documentation of the encoding of
 * string values. Given that the main purpose of dump/restore is
 * backing up a system, it seems very likely that the format uses the
 * system's current default encoding.
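 *
 * A typical usage sketch (the archive path is illustrative, and
 * {@code IOUtils} refers to org.apache.commons.compress.utils.IOUtils):
 * <pre>{@code
 * try (InputStream fis = Files.newInputStream(Paths.get("backup.dump"));
 *         DumpArchiveInputStream dump = new DumpArchiveInputStream(fis)) {
 *     DumpArchiveEntry entry;
 *     while ((entry = dump.getNextEntry()) != null) {
 *         if (!entry.isDirectory()) {
 *             final byte[] content = IOUtils.toByteArray(dump);
 *             // process content ...
 *         }
 *     }
 * }
 * }</pre>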
 *
 * @NotThreadSafe
 */
public class DumpArchiveInputStream extends ArchiveInputStream<DumpArchiveEntry> {
    /**
     * Looks at the first few bytes of the file to decide if it's a dump
     * archive. With 32 bytes we can look at the magic value; with a full
     * 1 KiB record we can also verify the checksum.
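     *
     * For example, given an InputStream {@code in} positioned at the start
     * of a candidate file (a minimal sketch; {@code in} is illustrative and
     * {@code IOUtils} is org.apache.commons.compress.utils.IOUtils):
     * <pre>{@code
     * final byte[] buffer = new byte[DumpArchiveConstants.TP_SIZE];
     * final int n = IOUtils.readFully(in, buffer);
     * final boolean isDump = DumpArchiveInputStream.matches(buffer, n);
     * }</pre>
     *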
     * @param buffer data to match
     * @param length length of data
     * @return whether the buffer seems to contain dump data
     */
    public static boolean matches(final byte[] buffer, final int length) {
        // do we have enough of the header?
        if (length < 32) {
            return false;
        }

        // this is the best test
        if (length >= DumpArchiveConstants.TP_SIZE) {
            return DumpArchiveUtil.verify(buffer);
        }

        // this will work in a pinch.
        return DumpArchiveConstants.NFS_MAGIC == DumpArchiveUtil.convert32(buffer, 24);
    }
    private final DumpArchiveSummary summary;
    private DumpArchiveEntry active;
    private boolean isClosed;
    private boolean hasHitEOF;
    private long entrySize;
    private long entryOffset;
    private int readIdx;
    private final byte[] readBuf = new byte[DumpArchiveConstants.TP_SIZE];
    private byte[] blockBuffer;
    private int recordOffset;
    private long filepos;

    protected TapeInputStream raw;

    // map of ino -> dirent entry. We can use this to reconstruct full paths.
    private final Map<Integer, Dirent> names = new HashMap<>();

    // map of ino -> (directory) entry when we're missing one or more elements in the path.
    private final Map<Integer, DumpArchiveEntry> pending = new HashMap<>();

    // queue of (directory) entries where we now have the full path.
    private final Queue<DumpArchiveEntry> queue;

    /**
     * The encoding to use for file names and labels.
     */
    private final ZipEncoding zipEncoding;

    // the provided encoding (for unit tests)
    final String encoding;

    /**
     * Constructor using the platform's default encoding for file
     * names.
     *
     * @param is stream to read from
     * @throws ArchiveException on error
     */
    public DumpArchiveInputStream(final InputStream is) throws ArchiveException {
        this(is, null);
    }

    /**
     * Constructor.
     *
     * @param is stream to read from
     * @param encoding the encoding to use for file names, or null
     * for the platform's default encoding
     * @since 1.6
     * @throws ArchiveException on error
     */
    public DumpArchiveInputStream(final InputStream is, final String encoding)
        throws ArchiveException {
        this.raw = new TapeInputStream(is);
        this.hasHitEOF = false;
        this.encoding = encoding;
        this.zipEncoding = ZipEncodingHelper.getZipEncoding(encoding);

        try {
            // read header, verify it's a dump archive.
            final byte[] headerBytes = raw.readRecord();

            if (!DumpArchiveUtil.verify(headerBytes)) {
                throw new UnrecognizedFormatException();
            }

            // get summary information
            summary = new DumpArchiveSummary(headerBytes, this.zipEncoding);

            // reset buffer with actual block size.
            raw.resetBlockSize(summary.getNTRec(), summary.isCompressed());

            // allocate our read buffer.
            blockBuffer = new byte[4 * DumpArchiveConstants.TP_SIZE];

            // skip past CLRI and BITS segments since we don't handle them yet.
            readCLRI();
            readBITS();
        } catch (final IOException ex) {
            throw new ArchiveException(ex.getMessage(), ex);
        }

        // put in a dummy record for the root node.
        final Dirent root = new Dirent(2, 2, 4, ".");
        names.put(2, root);
        // use a priority queue ordered by path name so that parent
        // directories are released before their children.
        queue = new PriorityQueue<>(10,
                (p, q) -> {
                    if (p.getOriginalName() == null || q.getOriginalName() == null) {
                        return Integer.MAX_VALUE;
                    }

                    return p.getOriginalName().compareTo(q.getOriginalName());
                });
    }

    /**
     * Closes this stream, including the underlying raw input stream.
     */
    @Override
    public void close() throws IOException {
        if (!isClosed) {
            isClosed = true;
            raw.close();
        }
    }

    @Override
    public long getBytesRead() {
        return raw.getBytesRead();
    }

    @Deprecated
    @Override
    public int getCount() {
        return (int) getBytesRead();
    }

    /**
     * Reads the next entry.
     *
     * @return the next entry
     * @throws IOException on error
     * @deprecated Use {@link #getNextEntry()}.
     */
    @Deprecated
    public DumpArchiveEntry getNextDumpEntry() throws IOException {
        return getNextEntry();
    }

    @Override
    public DumpArchiveEntry getNextEntry() throws IOException {
        DumpArchiveEntry entry = null;
        String path = null;

        // is there anything in the queue?
        if (!queue.isEmpty()) {
            return queue.remove();
        }

        while (entry == null) {
            if (hasHitEOF) {
                return null;
            }

            // skip any remaining records in this segment for the prior file.
            // we might still have holes, so it's easiest to do it block by
            // block; we may want to revisit this if the unnecessary
            // decompression time adds up.
            while (readIdx < active.getHeaderCount()) {
                if (!active.isSparseRecord(readIdx++)
                    && raw.skip(DumpArchiveConstants.TP_SIZE) == -1) {
                    throw new EOFException();
                }
            }

            readIdx = 0;
            filepos = raw.getBytesRead();

            byte[] headerBytes = raw.readRecord();

            if (!DumpArchiveUtil.verify(headerBytes)) {
                throw new InvalidFormatException();
            }

            active = DumpArchiveEntry.parse(headerBytes);

            // skip any remaining segments for the prior file.
            while (DumpArchiveConstants.SEGMENT_TYPE.ADDR == active.getHeaderType()) {
                if (raw.skip((long) DumpArchiveConstants.TP_SIZE
                             * (active.getHeaderCount()
                                - active.getHeaderHoles())) == -1) {
                    throw new EOFException();
                }

                filepos = raw.getBytesRead();
                headerBytes = raw.readRecord();

                if (!DumpArchiveUtil.verify(headerBytes)) {
                    throw new InvalidFormatException();
                }

                active = DumpArchiveEntry.parse(headerBytes);
            }

            // check if this is an end-of-volume marker.
            if (DumpArchiveConstants.SEGMENT_TYPE.END == active.getHeaderType()) {
                hasHitEOF = true;

                return null;
            }

            entry = active;

            if (entry.isDirectory()) {
                readDirectoryEntry(active);

                // the entry's content then reads as an empty InputStream.
                entryOffset = 0;
                entrySize = 0;
                readIdx = active.getHeaderCount();
            } else {
                entryOffset = 0;
                entrySize = active.getEntrySize();
                readIdx = 0;
            }

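            // mark the read buffer as exhausted so the first read() call
            // fetches a fresh record from the tape.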
            recordOffset = readBuf.length;

            path = getPath(entry);

            if (path == null) {
                entry = null;
            }
        }

        entry.setName(path);
        entry.setSimpleName(names.get(entry.getIno()).getName());
        entry.setOffset(filepos);

        return entry;
    }

    /**
     * Gets the full path for the specified archive entry, or null if there's a gap.
     *
     * @param entry the archive entry to resolve
     * @return the full path for the specified archive entry, or null if there's a gap
     */
    private String getPath(final DumpArchiveEntry entry) {
        // build the stack of path elements. It's possible that we're still
        // missing an intermediate directory; if so we clear the stack and
        // defer this entry until the gap has been filled in.
        final Stack<String> elements = new Stack<>();
        Dirent dirent = null;
        for (int i = entry.getIno();; i = dirent.getParentIno()) {
            if (!names.containsKey(i)) {
                elements.clear();
                break;
            }

            dirent = names.get(i);
            elements.push(dirent.getName());

            if (dirent.getIno() == dirent.getParentIno()) {
                break;
            }
        }

        // if an element is missing, defer the work and read the next entry.
        if (elements.isEmpty()) {
            pending.put(entry.getIno(), entry);

            return null;
        }

        // generate the full path from the stack of elements.
        final StringBuilder sb = new StringBuilder(elements.pop());

        while (!elements.isEmpty()) {
            sb.append('/');
            sb.append(elements.pop());
        }

        return sb.toString();
    }

    /**
     * Returns the archive summary information.
     * @return the summary
     */
    public DumpArchiveSummary getSummary() {
        return summary;
    }

    /**
     * Reads bytes from the current dump archive entry.
     *
     * This method is aware of the boundaries of the current
     * entry in the archive and will deal with them as if they
     * were this stream's start and EOF.
     *
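     * For example, to copy the current entry's contents to an
     * OutputStream (a minimal sketch; {@code in} and {@code out} are
     * illustrative names):
     * <pre>{@code
     * final byte[] buffer = new byte[1024];
     * int n;
     * while ((n = in.read(buffer, 0, buffer.length)) != -1) {
     *     out.write(buffer, 0, n);
     * }
     * }</pre>
     *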
     * @param buf The buffer into which to place bytes read.
     * @param off The offset at which to place bytes read.
     * @param len The number of bytes to read.
     * @return The number of bytes read, or -1 at EOF.
     * @throws IOException on error
     */
    @Override
    public int read(final byte[] buf, int off, int len) throws IOException {
        if (len == 0) {
            return 0;
        }
        int totalRead = 0;

        if (hasHitEOF || isClosed || entryOffset >= entrySize) {
            return -1;
        }

        if (active == null) {
            throw new IllegalStateException("No current dump entry");
        }

        if (len + entryOffset > entrySize) {
            len = (int) (entrySize - entryOffset);
        }

        while (len > 0) {
            final int sz = Math.min(len, readBuf.length - recordOffset);

            // copy any data we have
            if (recordOffset + sz <= readBuf.length) {
                System.arraycopy(readBuf, recordOffset, buf, off, sz);
                totalRead += sz;
                recordOffset += sz;
                len -= sz;
                off += sz;
            }

            // load next block if necessary.
            if (len > 0) {
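                // a segment header describes at most 512 records (TP_NINDIR
                // in the classic dump format); once we have consumed them
                // all, read and verify the next segment header.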
                if (readIdx >= 512) {
                    final byte[] headerBytes = raw.readRecord();

                    if (!DumpArchiveUtil.verify(headerBytes)) {
                        throw new InvalidFormatException();
                    }

                    active = DumpArchiveEntry.parse(headerBytes);
                    readIdx = 0;
                }

                if (!active.isSparseRecord(readIdx++)) {
                    final int r = raw.read(readBuf, 0, readBuf.length);
                    if (r != readBuf.length) {
                        throw new EOFException();
                    }
                } else {
                    Arrays.fill(readBuf, (byte) 0);
                }

                recordOffset = 0;
            }
        }

        entryOffset += totalRead;

        return totalRead;
    }

    /**
     * Reads the BITS segment.
     */
    private void readBITS() throws IOException {
        final byte[] buffer = raw.readRecord();

        if (!DumpArchiveUtil.verify(buffer)) {
            throw new InvalidFormatException();
        }

        active = DumpArchiveEntry.parse(buffer);

        if (DumpArchiveConstants.SEGMENT_TYPE.BITS != active.getHeaderType()) {
            throw new InvalidFormatException();
        }

        // we don't do anything with this yet.
        if (raw.skip((long) DumpArchiveConstants.TP_SIZE * active.getHeaderCount()) == -1) {
            throw new EOFException();
        }
        readIdx = active.getHeaderCount();
    }

    /**
     * Reads the CLRI (deleted inode) segment.
     */
    private void readCLRI() throws IOException {
        final byte[] buffer = raw.readRecord();

        if (!DumpArchiveUtil.verify(buffer)) {
            throw new InvalidFormatException();
        }

        active = DumpArchiveEntry.parse(buffer);

        if (DumpArchiveConstants.SEGMENT_TYPE.CLRI != active.getHeaderType()) {
            throw new InvalidFormatException();
        }

        // we don't do anything with this yet.
        if (raw.skip((long) DumpArchiveConstants.TP_SIZE * active.getHeaderCount()) == -1) {
            throw new EOFException();
        }
        readIdx = active.getHeaderCount();
    }

    /**
     * Reads a directory entry.
     */
    private void readDirectoryEntry(DumpArchiveEntry entry)
        throws IOException {
        long size = entry.getEntrySize();
        boolean first = true;

        while (first ||
                DumpArchiveConstants.SEGMENT_TYPE.ADDR == entry.getHeaderType()) {
            // read the header that we just peeked at.
            if (!first) {
                raw.readRecord();
            }

            if (!names.containsKey(entry.getIno()) &&
                    DumpArchiveConstants.SEGMENT_TYPE.INODE == entry.getHeaderType()) {
                pending.put(entry.getIno(), entry);
            }

            final int datalen = DumpArchiveConstants.TP_SIZE * entry.getHeaderCount();

            if (blockBuffer.length < datalen) {
                blockBuffer = IOUtils.readRange(raw, datalen);
                if (blockBuffer.length != datalen) {
                    throw new EOFException();
                }
            } else if (raw.read(blockBuffer, 0, datalen) != datalen) {
                throw new EOFException();
            }

            int reclen = 0;

            for (int i = 0; i < datalen - 8 && i < size - 8; i += reclen) {
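                // each record in the block is a BSD-style dirent:
                //   bytes 0-3: inode number, bytes 4-5: record length,
                //   byte 6: file type, byte 7: name length, bytes 8+: name.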
                final int ino = DumpArchiveUtil.convert32(blockBuffer, i);
                reclen = DumpArchiveUtil.convert16(blockBuffer, i + 4);
                if (reclen == 0) {
                    // guard against an infinite loop on a corrupt archive.
                    throw new InvalidFormatException();
                }

                final byte type = blockBuffer[i + 6];

                final String name = DumpArchiveUtil.decode(zipEncoding, blockBuffer, i + 8, blockBuffer[i + 7]);

                if (".".equals(name) || "..".equals(name)) {
                    // do nothing...
                    continue;
                }

                final Dirent d = new Dirent(ino, entry.getIno(), type, name);

                /*
                if ((type == 4) && names.containsKey(ino)) {
                    System.out.println("we already have ino: " +
                                       names.get(ino));
                }
                */

                names.put(ino, d);

                // check whether this allows us to fill anything in the pending list.
                pending.forEach((k, v) -> {
                    final String path = getPath(v);

                    if (path != null) {
                        v.setName(path);
                        v.setSimpleName(names.get(k).getName());
                        queue.add(v);
                    }
                });

                // remove anything that we found. (We can't do it earlier
                // because of concurrent modification exceptions.)
                queue.forEach(e -> pending.remove(e.getIno()));
            }

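            // peek at the next header: if it is an ADDR segment it continues
            // this directory and is consumed at the top of the loop;
            // otherwise it belongs to the next entry.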
            final byte[] peekBytes = raw.peek();

            if (!DumpArchiveUtil.verify(peekBytes)) {
                throw new InvalidFormatException();
            }

            entry = DumpArchiveEntry.parse(peekBytes);
            first = false;
            size -= DumpArchiveConstants.TP_SIZE;
        }
    }

}