/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * This package is based on the work done by Timothy Gerard Endres
 * (time@ice.com) to whom the Ant project is very grateful for his great code.
 */

package org.apache.commons.compress.archivers.tar;

import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipEncoding;
import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
import org.apache.commons.compress.utils.ArchiveUtils;
import org.apache.commons.compress.utils.BoundedInputStream;
import org.apache.commons.compress.utils.IOUtils;

/**
 * The TarInputStream reads a UNIX tar archive as an InputStream.
 * Methods are provided to position at each successive entry in
 * the archive, and then read each entry as a normal input stream
 * using read().
 *
 * @NotThreadSafe
 */
public class TarArchiveInputStream extends ArchiveInputStream<TarArchiveEntry> {

    /** Size of the scratch buffer used when draining long-name entries. */
    private static final int SMALL_BUFFER_SIZE = 256;

    /**
     * Checks if the signature matches what is expected for a tar file.
     *
     * <p>Recognizes POSIX (ustar), GNU (with either space- or zero-padded
     * version field), and — for COMPRESS-107 — Ant-generated tar magic.</p>
     *
     * @param signature
     *            the bytes to check
     * @param length
     *            the number of bytes to check
     * @return true, if this stream is a tar archive stream, false otherwise
     */
    public static boolean matches(final byte[] signature, final int length) {
        // Need enough bytes to cover both the magic and the version field.
        if (length < TarConstants.VERSION_OFFSET + TarConstants.VERSIONLEN) {
            return false;
        }

        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_POSIX,
                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
            &&
            ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_POSIX,
                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
                ) {
            return true;
        }
        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_GNU,
                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
            &&
            (
             ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_SPACE,
                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
            ||
            ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_ZERO,
                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
            )
                ) {
            return true;
        }
        // COMPRESS-107 - recognize Ant tar files
        return ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_ANT,
                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
            &&
            ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_ANT,
                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN);
    }

    /** Scratch buffer reused while draining GNU long-name/long-link entries. */
    private final byte[] smallBuf = new byte[SMALL_BUFFER_SIZE];

    /** The size the TAR header */
    private final int recordSize;

    /** The buffer to store the TAR header **/
    private final byte[] recordBuffer;

    /** The size of a block */
    private final int blockSize;

    /** True if file has hit EOF */
    private boolean hasHitEOF;

    /** Size of the current entry */
    private long entrySize;

    /** How far into the entry the stream is at */
    private long entryOffset;

    /** An input stream to read from */
    private final InputStream inputStream;

    /** Input streams for reading sparse entries **/
    private List<InputStream> sparseInputStreams;

    /** the index of current input stream being read when reading sparse entries */
    private int currentSparseInputStreamIndex;

    /** The meta-data about the current entry */
    private TarArchiveEntry currEntry;

    /** The encoding of the file */
    private final ZipEncoding zipEncoding;

    // the provided encoding (for unit tests)
    final String encoding;

    // the global PAX header
    private Map<String, String> globalPaxHeaders = new HashMap<>();

    // the global sparse headers, this is only used in PAX Format 0.X
    private final List<TarArchiveStructSparse> globalSparseHeaders = new ArrayList<>();

    // when true, illegal header values are mapped to TarArchiveEntry.UNKNOWN
    // instead of raising an exception
    private final boolean lenient;

    /**
     * Constructor for TarInputStream.
     * @param is the input stream to use
     */
    public TarArchiveInputStream(final InputStream is) {
        this(is, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE);
    }

    /**
     * Constructor for TarInputStream.
     * @param is the input stream to use
     * @param lenient when set to true illegal values for group/userid, mode, device numbers and timestamp will be
     * ignored and the fields set to {@link TarArchiveEntry#UNKNOWN}. When set to false such illegal fields cause an
     * exception instead.
     * @since 1.19
     */
    public TarArchiveInputStream(final InputStream is, final boolean lenient) {
        this(is, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE, null, lenient);
    }

    /**
     * Constructor for TarInputStream.
     * @param is the input stream to use
     * @param blockSize the block size to use
     */
    public TarArchiveInputStream(final InputStream is, final int blockSize) {
        this(is, blockSize, TarConstants.DEFAULT_RCDSIZE);
    }

    /**
     * Constructor for TarInputStream.
     * @param is the input stream to use
     * @param blockSize the block size to use
     * @param recordSize the record size to use
     */
    public TarArchiveInputStream(final InputStream is, final int blockSize, final int recordSize) {
        this(is, blockSize, recordSize, null);
    }

    /**
     * Constructor for TarInputStream.
     * @param is the input stream to use
     * @param blockSize the block size to use
     * @param recordSize the record size to use
     * @param encoding name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(final InputStream is, final int blockSize, final int recordSize,
                                 final String encoding) {
        this(is, blockSize, recordSize, encoding, false);
    }

    /**
     * Constructor for TarInputStream.
     * @param is the input stream to use
     * @param blockSize the block size to use
     * @param recordSize the record size to use
     * @param encoding name of the encoding to use for file names
     * @param lenient when set to true illegal values for group/userid, mode, device numbers and timestamp will be
     * ignored and the fields set to {@link TarArchiveEntry#UNKNOWN}. When set to false such illegal fields cause an
     * exception instead.
     * @since 1.19
     */
    public TarArchiveInputStream(final InputStream is, final int blockSize, final int recordSize,
                                 final String encoding, final boolean lenient) {
        this.inputStream = is;
        this.hasHitEOF = false;
        this.encoding = encoding;
        this.zipEncoding = ZipEncodingHelper.getZipEncoding(encoding);
        this.recordSize = recordSize;
        this.recordBuffer = new byte[recordSize];
        this.blockSize = blockSize;
        this.lenient = lenient;
    }

    /**
     * Constructor for TarInputStream.
     * @param is the input stream to use
     * @param blockSize the block size to use
     * @param encoding name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(final InputStream is, final int blockSize,
                                 final String encoding) {
        this(is, blockSize, TarConstants.DEFAULT_RCDSIZE, encoding);
    }

    /**
     * Constructor for TarInputStream.
     * @param is the input stream to use
     * @param encoding name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(final InputStream is, final String encoding) {
        this(is, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE,
             encoding);
    }

    /**
     * Applies the given PAX headers and sparse headers to the current entry,
     * overriding the values read from the entry's own ustar header block.
     */
    private void applyPaxHeadersToCurrentEntry(final Map<String, String> headers, final List<TarArchiveStructSparse> sparseHeaders)
        throws IOException {
        currEntry.updateEntryFromPaxHeaders(headers);
        currEntry.setSparseHeaders(sparseHeaders);
    }

    /**
     * Gets the available data that can be read from the current
     * entry in the archive. This does not indicate how much data
     * is left in the entire archive, only in the current entry.
     * This value is determined from the entry's size header field
     * and the amount of data already read from the current entry.
     * Integer.MAX_VALUE is returned in case more than Integer.MAX_VALUE
     * bytes are left in the current entry in the archive.
     *
     * @return The number of available bytes for the current entry.
     * @throws IOException for signature
     */
    @Override
    public int available() throws IOException {
        if (isDirectory()) {
            return 0;
        }

        // getRealSize() accounts for the holes of sparse entries, so this is
        // the logical (extracted) number of bytes remaining, clamped to int.
        if (currEntry.getRealSize() - entryOffset > Integer.MAX_VALUE) {
            return Integer.MAX_VALUE;
        }
        return (int) (currEntry.getRealSize() - entryOffset);
    }


    /**
     * Build the input streams consisting of all-zero input streams and non-zero input streams.
     * When reading from the non-zero input streams, the data is actually read from the original input stream.
     * The size of each input stream is introduced by the sparse headers.
     *
     * NOTE : Some all-zero input streams and non-zero input streams have the size of 0. We DO NOT store the
     * 0 size input streams because they are meaningless.
     */
    private void buildSparseInputStreams() throws IOException {
        currentSparseInputStreamIndex = -1;
        sparseInputStreams = new ArrayList<>();

        final List<TarArchiveStructSparse> sparseHeaders = currEntry.getOrderedSparseHeaders();

        // Stream doesn't need to be closed at all as it doesn't use any resources
        final InputStream zeroInputStream = new TarArchiveSparseZeroInputStream(); //NOSONAR
        // logical offset into the extracted entry
        long offset = 0;
        for (final TarArchiveStructSparse sparseHeader : sparseHeaders) {
            final long zeroBlockSize = sparseHeader.getOffset() - offset;
            if (zeroBlockSize < 0) {
                // sparse header says to move backwards inside the extracted entry
                throw new IOException("Corrupted struct sparse detected");
            }

            // only store the zero block if it is not empty
            if (zeroBlockSize > 0) {
                sparseInputStreams.add(new BoundedInputStream(zeroInputStream, sparseHeader.getOffset() - offset));
            }

            // only store the input streams with non-zero size
            if (sparseHeader.getNumbytes() > 0) {
                sparseInputStreams.add(new BoundedInputStream(inputStream, sparseHeader.getNumbytes()));
            }

            offset = sparseHeader.getOffset() + sparseHeader.getNumbytes();
        }

        if (!sparseInputStreams.isEmpty()) {
            currentSparseInputStreamIndex = 0;
        }
    }

    /**
     * Whether this class is able to read the given entry.
     *
     * @return The implementation will return true if the {@link ArchiveEntry} is an instance of {@link TarArchiveEntry}
     */
    @Override
    public boolean canReadEntryData(final ArchiveEntry ae) {
        return ae instanceof TarArchiveEntry;
    }

    /**
     * Closes this stream. Calls the TarBuffer's close() method.
     * @throws IOException on error
     */
    @Override
    public void close() throws IOException {
        // Close all the input streams in sparseInputStreams
        // NOTE(review): if one of these close() calls throws, the remaining
        // sparse streams and the underlying inputStream are left open.
        if (sparseInputStreams != null) {
            for (final InputStream inputStream : sparseInputStreams) {
                inputStream.close();
            }
        }

        inputStream.close();
    }

    /**
     * This method is invoked once the end of the archive is hit, it
     * tries to consume the remaining bytes under the assumption that
     * the tool creating this archive has padded the last block.
     */
    private void consumeRemainderOfLastBlock() throws IOException {
        final long bytesReadOfLastBlock = getBytesRead() % blockSize;
        if (bytesReadOfLastBlock > 0) {
            final long skipped = IOUtils.skip(inputStream, blockSize - bytesReadOfLastBlock);
            count(skipped);
        }
    }

    /**
     * For FileInputStream, the skip always return the number you input, so we
     * need the available bytes to determine how many bytes are actually skipped
     *
     * @param available available bytes returned by inputStream.available()
     * @param skipped skipped bytes returned by inputStream.skip()
     * @param expected bytes expected to skip
     * @return number of bytes actually skipped
     * @throws IOException if a truncated tar archive is detected
     */
    private long getActuallySkipped(final long available, final long skipped, final long expected) throws IOException {
        long actuallySkipped = skipped;
        if (inputStream instanceof FileInputStream) {
            // FileInputStream.skip can report skipping past EOF; clamp by
            // what the stream said was available before the skip.
            actuallySkipped = Math.min(skipped, available);
        }

        if (actuallySkipped != expected) {
            throw new IOException("Truncated TAR archive");
        }

        return actuallySkipped;
    }

    /**
     * Gets the current TAR Archive Entry that this input stream is processing
     *
     * @return The current Archive Entry
     */
    public TarArchiveEntry getCurrentEntry() {
        return currEntry;
    }

    /**
     * Gets the next entry in this tar archive as long name data.
     *
     * @return The next entry in the archive as long name data, or null.
     * @throws IOException on error
     */
    protected byte[] getLongNameData() throws IOException {
        // read in the name
        final ByteArrayOutputStream longName = new ByteArrayOutputStream();
        int length = 0;
        while ((length = read(smallBuf)) >= 0) {
            longName.write(smallBuf, 0, length);
        }
        getNextEntry();
        if (currEntry == null) {
            // Bugzilla: 40334
            // Malformed tar file - long entry name not followed by entry
            return null;
        }
        byte[] longNameData = longName.toByteArray();
        // remove trailing null terminator(s)
        length = longNameData.length;
        while (length > 0 && longNameData[length - 1] == 0) {
            --length;
        }
        if (length != longNameData.length) {
            longNameData = Arrays.copyOf(longNameData, length);
        }
        return longNameData;
    }

    /**
     * Returns the next Archive Entry in this Stream.
     *
     * @return the next entry,
     *         or {@code null} if there are no more entries
     * @throws IOException if the next entry could not be read
     */
    @Override
    public TarArchiveEntry getNextEntry() throws IOException {
        return getNextTarEntry();
    }

    /**
     * Gets the next entry in this tar archive. This will skip
     * over any remaining data in the current entry, if there
     * is one, and place the input stream at the header of the
     * next entry, and read the header and instantiate a new
     * TarEntry from the header bytes and return that entry.
     * If there are no more entries in the archive, null will
     * be returned to indicate that the end of the archive has
     * been reached.
     *
     * @return The next TarEntry in the archive, or null.
     * @throws IOException on error
     * @deprecated Use {@link #getNextEntry()}.
     */
    @Deprecated
    public TarArchiveEntry getNextTarEntry() throws IOException {
        if (isAtEOF()) {
            return null;
        }

        if (currEntry != null) {
            /* Skip will only go to the end of the current entry */
            IOUtils.skip(this, Long.MAX_VALUE);

            /* skip to the end of the last record */
            skipRecordPadding();
        }

        final byte[] headerBuf = getRecord();

        if (headerBuf == null) {
            /* hit EOF */
            currEntry = null;
            return null;
        }

        try {
            currEntry = new TarArchiveEntry(globalPaxHeaders, headerBuf, zipEncoding, lenient);
        } catch (final IllegalArgumentException e) {
            throw new IOException("Error detected parsing the header", e);
        }

        entryOffset = 0;
        entrySize = currEntry.getSize();

        if (currEntry.isGNULongLinkEntry()) {
            final byte[] longLinkData = getLongNameData();
            if (longLinkData == null) {
                // Bugzilla: 40334
                // Malformed tar file - long link entry name not followed by
                // entry
                return null;
            }
            currEntry.setLinkName(zipEncoding.decode(longLinkData));
        }

        if (currEntry.isGNULongNameEntry()) {
            final byte[] longNameData = getLongNameData();
            if (longNameData == null) {
                // Bugzilla: 40334
                // Malformed tar file - long entry name not followed by
                // entry
                return null;
            }

            // COMPRESS-509 : the name of directories should end with '/'
            final String name = zipEncoding.decode(longNameData);
            currEntry.setName(name);
            if (currEntry.isDirectory() && !name.endsWith("/")) {
                currEntry.setName(name + "/");
            }
        }

        if (currEntry.isGlobalPaxHeader()) { // Process Global Pax headers
            readGlobalPaxHeaders();
        }

        try {
            if (currEntry.isPaxHeader()) { // Process Pax headers
                paxHeaders();
            } else if (!globalPaxHeaders.isEmpty()) {
                applyPaxHeadersToCurrentEntry(globalPaxHeaders, globalSparseHeaders);
            }
        } catch (final NumberFormatException e) {
            throw new IOException("Error detected parsing the pax header", e);
        }

        if (currEntry.isOldGNUSparse()) { // Process sparse files
            readOldGNUSparse();
        }

        // If the size of the next element in the archive has changed
        // due to a new size being reported in the posix header
        // information, we update entrySize here so that it contains
        // the correct value.
        entrySize = currEntry.getSize();

        return currEntry;
    }

    /**
     * Gets the next record in this tar archive. This will skip
     * over any remaining data in the current entry, if there
     * is one, and place the input stream at the header of the
     * next entry.
     *
     * <p>If there are no more entries in the archive, null will be
     * returned to indicate that the end of the archive has been
     * reached. At the same time the {@code hasHitEOF} marker will be
     * set to true.</p>
     *
     * @return The next header in the archive, or null.
     * @throws IOException on error
     */
    private byte[] getRecord() throws IOException {
        byte[] headerBuf = readRecord();
        setAtEOF(isEOFRecord(headerBuf));
        if (isAtEOF() && headerBuf != null) {
            // an all-zero record marks end-of-archive; drain the trailing
            // padding so getBytesRead() reflects the full archive size
            tryToConsumeSecondEOFRecord();
            consumeRemainderOfLastBlock();
            headerBuf = null;
        }
        return headerBuf;
    }

    /**
     * Gets the record size being used by this stream's buffer.
     *
     * @return The TarBuffer record size.
     */
    public int getRecordSize() {
        return recordSize;
    }

    /** @return true once the end-of-archive marker has been seen */
    protected final boolean isAtEOF() {
        return hasHitEOF;
    }

    /** @return true if there is a current entry and it is a directory */
    private boolean isDirectory() {
        return currEntry != null && currEntry.isDirectory();
    }

    /**
     * Determine if an archive record indicate End of Archive. End of
     * archive is indicated by a record that consists entirely of null bytes.
     *
     * @param record The record data to check.
     * @return true if the record data is an End of Archive
     */
    protected boolean isEOFRecord(final byte[] record) {
        return record == null || ArchiveUtils.isArrayZero(record, recordSize);
    }

    /**
     * Since we do not support marking just yet, we do nothing.
     *
     * @param markLimit The limit to mark.
     */
    @Override
    public synchronized void mark(final int markLimit) {
    }

    /**
     * Since we do not support marking just yet, we return false.
     *
     * @return False.
     */
    @Override
    public boolean markSupported() {
        return false;
    }

    /**
     * For PAX Format 0.0, the sparse headers(GNU.sparse.offset and GNU.sparse.numbytes)
     * may appear multi times, and they look like:
     *
     * GNU.sparse.size=size
     * GNU.sparse.numblocks=numblocks
     * repeat numblocks times
     *   GNU.sparse.offset=offset
     *   GNU.sparse.numbytes=numbytes
     * end repeat
     *
     *
     * For PAX Format 0.1, the sparse headers are stored in a single variable : GNU.sparse.map
     *
     * GNU.sparse.map
     *    Map of non-null data chunks. It is a string consisting of comma-separated values "offset,size[,offset-1,size-1...]"
     *
     *
     * For PAX Format 1.X:
     * The sparse map itself is stored in the file data block, preceding the actual file data.
     * It consists of a series of decimal numbers delimited by newlines. The map is padded with nulls to the nearest block boundary.
     * The first number gives the number of entries in the map. Following are map entries, each one consisting of two numbers
     * giving the offset and size of the data block it describes.
     * @throws IOException if reading or parsing the headers fails
     */
    private void paxHeaders() throws IOException {
        List<TarArchiveStructSparse> sparseHeaders = new ArrayList<>();
        final Map<String, String> headers = TarUtils.parsePaxHeaders(this, sparseHeaders, globalPaxHeaders, entrySize);

        // for 0.1 PAX Headers
        if (headers.containsKey(TarGnuSparseKeys.MAP)) {
            sparseHeaders = new ArrayList<>(TarUtils.parseFromPAX01SparseHeaders(headers.get(TarGnuSparseKeys.MAP)));
        }
        getNextEntry(); // Get the actual file entry
        if (currEntry == null) {
            throw new IOException("premature end of tar archive. Didn't find any entry after PAX header.");
        }
        applyPaxHeadersToCurrentEntry(headers, sparseHeaders);

        // for 1.0 PAX Format, the sparse map is stored in the file data block
        if (currEntry.isPaxGNU1XSparse()) {
            sparseHeaders = TarUtils.parsePAX1XSparseHeaders(inputStream, recordSize);
            currEntry.setSparseHeaders(sparseHeaders);
        }

        // sparse headers are all done reading, we need to build
        // sparse input streams using these sparse headers
        buildSparseInputStreams();
    }

    /**
     * Reads bytes from the current tar archive entry.
     *
     * This method is aware of the boundaries of the current
     * entry in the archive and will deal with them as if they
     * were this stream's start and EOF.
     *
     * @param buf The buffer into which to place bytes read.
     * @param offset The offset at which to place bytes read.
     * @param numToRead The number of bytes to read.
     * @return The number of bytes read, or -1 at EOF.
     * @throws IOException on error
     */
    @Override
    public int read(final byte[] buf, final int offset, int numToRead) throws IOException {
        if (numToRead == 0) {
            return 0;
        }
        int totalRead = 0;

        if (isAtEOF() || isDirectory()) {
            return -1;
        }

        if (currEntry == null) {
            throw new IllegalStateException("No current tar entry");
        }

        if (entryOffset >= currEntry.getRealSize()) {
            return -1;
        }

        // never read past the logical end of the current entry
        numToRead = Math.min(numToRead, available());

        if (currEntry.isSparse()) {
            // for sparse entries, we need to read them in another way
            totalRead = readSparse(buf, offset, numToRead);
        } else {
            totalRead = inputStream.read(buf, offset, numToRead);
        }

        if (totalRead == -1) {
            if (numToRead > 0) {
                // entry header promised more data than the stream contains
                throw new IOException("Truncated TAR archive");
            }
            setAtEOF(true);
        } else {
            count(totalRead);
            entryOffset += totalRead;
        }

        return totalRead;
    }

    /**
     * Parses the global PAX header entry and positions the stream at the
     * following (actual) entry.
     */
    private void readGlobalPaxHeaders() throws IOException {
        globalPaxHeaders = TarUtils.parsePaxHeaders(this, globalSparseHeaders, globalPaxHeaders, entrySize);
        getNextEntry(); // Get the actual file entry

        if (currEntry == null) {
            throw new IOException("Error detected parsing the pax header");
        }
    }

    /**
     * Adds the sparse chunks from the current entry to the sparse chunks,
     * including any additional sparse entries following the current entry.
     *
     * @throws IOException on error
     */
    private void readOldGNUSparse() throws IOException {
        if (currEntry.isExtended()) {
            // old GNU sparse format chains extra header records when the
            // sparse map does not fit into the main header
            TarArchiveSparseEntry entry;
            do {
                final byte[] headerBuf = getRecord();
                if (headerBuf == null) {
                    throw new IOException("premature end of tar archive. Didn't find extended_header after header with extended flag.");
                }
                entry = new TarArchiveSparseEntry(headerBuf);
                currEntry.getSparseHeaders().addAll(entry.getSparseHeaders());
            } while (entry.isExtended());
        }

        // sparse headers are all done reading, we need to build
        // sparse input streams using these sparse headers
        buildSparseInputStreams();
    }

    /**
     * Read a record from the input stream and return the data.
     *
     * @return The record data or null if EOF has been hit.
     * @throws IOException on error
     */
    protected byte[] readRecord() throws IOException {
        final int readNow = IOUtils.readFully(inputStream, recordBuffer);
        count(readNow);
        if (readNow != recordSize) {
            // partial record == end of stream
            return null;
        }

        return recordBuffer;
    }

    /**
     * For sparse tar entries, there are many "holes"(consisting of all 0) in the file. Only the non-zero data is
     * stored in tar files, and they are stored separately. The structure of non-zero data is introduced by the
     * sparse headers using the offset, where a block of non-zero data starts, and numbytes, the length of the
     * non-zero data block.
     * When reading sparse entries, the actual data is read out with "holes" and non-zero data combined together
     * according to the sparse headers.
     *
     * @param buf The buffer into which to place bytes read.
     * @param offset The offset at which to place bytes read.
     * @param numToRead The number of bytes to read.
     * @return The number of bytes read, or -1 at EOF.
     * @throws IOException on error
     */
    private int readSparse(final byte[] buf, final int offset, final int numToRead) throws IOException {
        // if there are no actual input streams, just read from the original input stream
        if (sparseInputStreams == null || sparseInputStreams.isEmpty()) {
            return inputStream.read(buf, offset, numToRead);
        }

        if (currentSparseInputStreamIndex >= sparseInputStreams.size()) {
            return -1;
        }

        final InputStream currentInputStream = sparseInputStreams.get(currentSparseInputStreamIndex);
        final int readLen = currentInputStream.read(buf, offset, numToRead);

        // if the current input stream is the last input stream,
        // just return the number of bytes read from current input stream
        if (currentSparseInputStreamIndex == sparseInputStreams.size() - 1) {
            return readLen;
        }

        // if EOF of current input stream is meet, open a new input stream and recursively call read
        if (readLen == -1) {
            currentSparseInputStreamIndex++;
            return readSparse(buf, offset, numToRead);
        }

        // if the rest data of current input stream is not long enough, open a new input stream
        // and recursively call read
        if (readLen < numToRead) {
            currentSparseInputStreamIndex++;
            final int readLenOfNext = readSparse(buf, offset + readLen, numToRead - readLen);
            if (readLenOfNext == -1) {
                return readLen;
            }

            return readLen + readLenOfNext;
        }

        // if the rest data of current input stream is enough(which means readLen == len), just return readLen
        return readLen;
    }

    /**
     * Since we do not support marking just yet, we do nothing.
     */
    @Override
    public synchronized void reset() {
    }

    /** Records whether the end-of-archive marker has been reached. */
    protected final void setAtEOF(final boolean b) {
        hasHitEOF = b;
    }

    /** Replaces the current entry; intended for use by subclasses. */
    protected final void setCurrentEntry(final TarArchiveEntry e) {
        currEntry = e;
    }

    /**
     * Skips over and discards {@code n} bytes of data from this input
     * stream. The {@code skip} method may, for a variety of reasons, end
     * up skipping over some smaller number of bytes, possibly {@code 0}.
     * This may result from any of a number of conditions; reaching end of file
     * or end of entry before {@code n} bytes have been skipped; are only
     * two possibilities. The actual number of bytes skipped is returned. If
     * {@code n} is negative, no bytes are skipped.
     *
     *
     * @param n
     *            the number of bytes to be skipped.
     * @return the actual number of bytes skipped.
     * @throws IOException if a truncated tar archive is detected
     *                     or some other I/O error occurs
     */
    @Override
    public long skip(final long n) throws IOException {
        if (n <= 0 || isDirectory()) {
            return 0;
        }

        final long availableOfInputStream = inputStream.available();
        final long available = currEntry.getRealSize() - entryOffset;
        final long numToSkip = Math.min(n, available);
        long skipped;

        if (!currEntry.isSparse()) {
            skipped = IOUtils.skip(inputStream, numToSkip);
            // for non-sparse entry, we should get the bytes actually skipped bytes along with
            // inputStream.available() if inputStream is instance of FileInputStream
            skipped = getActuallySkipped(availableOfInputStream, skipped, numToSkip);
        } else {
            skipped = skipSparse(numToSkip);
        }


        count(skipped);
        entryOffset += skipped;
        return skipped;
    }

    /**
     * The last record block should be written at the full size, so skip any
     * additional space used to fill a record after an entry.
     *
     * @throws IOException if a truncated tar archive is detected
     */
    private void skipRecordPadding() throws IOException {
        if (!isDirectory() && this.entrySize > 0 && this.entrySize % this.recordSize != 0) {
            final long available = inputStream.available();
            // number of records the entry occupies, rounded up
            final long numRecords = this.entrySize / this.recordSize + 1;
            final long padding = numRecords * this.recordSize - this.entrySize;
            long skipped = IOUtils.skip(inputStream, padding);

            skipped = getActuallySkipped(available, skipped, padding);

            count(skipped);
        }
    }

    /**
     * Skip n bytes from current input stream, if the current input stream doesn't have enough data to skip,
     * jump to the next input stream and skip the rest bytes, keep doing this until total n bytes are skipped
     * or the input streams are all skipped
     *
     * @param n bytes of data to skip
     * @return actual bytes of data skipped
     * @throws IOException if skipping on an underlying stream fails
     */
    private long skipSparse(final long n) throws IOException {
        if (sparseInputStreams == null || sparseInputStreams.isEmpty()) {
            return inputStream.skip(n);
        }

        long bytesSkipped = 0;

        while (bytesSkipped < n && currentSparseInputStreamIndex < sparseInputStreams.size()) {
            final InputStream currentInputStream = sparseInputStreams.get(currentSparseInputStreamIndex);
            bytesSkipped += currentInputStream.skip(n - bytesSkipped);

            if (bytesSkipped < n) {
                // current stream exhausted; move on to the next one
                currentSparseInputStreamIndex++;
            }
        }

        return bytesSkipped;
    }

    /**
     * Tries to read the next record rewinding the stream if it is not an EOF record.
     *
     * <p>This is meant to protect against cases where a tar
     * implementation has written only one EOF record when two are
     * expected. Actually this won't help since a non-conforming
     * implementation likely won't fill full blocks consisting of - by
     * default - ten records either so we probably have already read
     * beyond the archive anyway.</p>
     */
    private void tryToConsumeSecondEOFRecord() throws IOException {
        boolean shouldReset = true;
        final boolean marked = inputStream.markSupported();
        if (marked) {
            inputStream.mark(recordSize);
        }
        try {
            shouldReset = !isEOFRecord(readRecord());
        } finally {
            if (shouldReset && marked) {
                // undo the speculative read: correct the byte count and
                // rewind the underlying stream to the marked position
                pushedBackBytes(recordSize);
                inputStream.reset();
            }
        }
    }
}