1
1
//! This module is used to accurately "estimate" the size of a tar archive
2
2
//! created from a given directory structure.
3
- //!
3
+ //!
4
4
//! The tar format
5
5
//! ==============
6
6
//!
166
166
//! -------
167
167
//!
168
168
//! GNU tar does not support sockets, and neither do the other implementations.
169
+ //!
170
+ //! Deviations from tar-rs
171
+ //! ======================
172
+ //!
173
+ //! Some behaviors of `tar-rs` are not the same as GNU ones, including:
174
+ //!
175
+ //! - The `./` prefix is stripped by default in `tar-rs`.
176
+ //! - `tar-rs` does not handle hard links.
177
//! - `tar-rs` uses a blocking factor of 1 instead of 20, thus the entire
//!   archive is only padded to a 512-byte block.
169
179
170
- use anyhow:: { Context , Result } ;
180
+ use anyhow:: { Context , Result , bail } ;
171
181
use std:: {
172
- collections:: HashMap , env:: { current_dir, set_current_dir} , fs:: { read_link, Metadata } , os:: unix:: fs:: { FileTypeExt , MetadataExt } , path:: { Path , PathBuf }
182
+ collections:: HashMap ,
183
+ env:: { current_dir, set_current_dir} ,
184
+ fs:: read_link,
185
+ os:: unix:: fs:: { FileTypeExt , MetadataExt } ,
186
+ path:: { Path , PathBuf } ,
173
187
} ;
174
188
use walkdir:: WalkDir ;
175
189
176
190
/// The maximum filename length in the tar header.
///
/// Names longer than this are stored with the GNU `@LongLink` extension,
/// which costs additional blocks (accounted for in `get_size_in_blocks`).
const NAME_FIELD_SIZE: usize = 100;
/// The block size. Every tar member (header and data) is padded to a
/// multiple of this.
const BLOCK_SIZE: u64 = 512;
// The record size is `BLOCKING_FACTOR * BLOCK_SIZE`.
//
// The entire tar archive must pad to this size, since it is the unit size of
// one read/write operation from/to the medium, similar to the `bs=` option
// in the dd(1) utility.
//
// The GNU default (compiled in) is 20, thus resulting in a minimum 10KiB
// tar file.
//
// NOTE: the record size is now supplied by the caller of
// `get_tar_dir_size()` instead of being a compile-time constant.
// const BLOCKING_FACTOR: u64 = 20;
// const RECORD_SIZE: u64 = BLOCKING_FACTOR * BLOCK_SIZE;
189
205
190
206
/// Pad the given size to 512-byte sized blocks. Returns the number of blocks.
191
- fn pad_512_blocksize ( size : u64 ) -> u64 {
207
+ fn pad_to_blocksize ( size : u64 ) -> u64 {
192
208
let padded_bl = size. div_ceil ( BLOCK_SIZE ) ;
193
209
let padded_size = padded_bl * BLOCK_SIZE ;
194
210
assert ! (
@@ -200,63 +216,77 @@ fn pad_512_blocksize(size: u64) -> u64 {
200
216
padded_bl
201
217
}
202
218
203
- /// Get the intended size for xattr of a given file.
204
- fn get_xattrs_size ( _file : & dyn AsRef < Path > , _metadata : & Metadata ) -> Result < u64 > {
205
- // To be implemented
206
- Ok ( 0 )
207
- }
208
-
209
219
/// Get the intended size occupied in the tar archive of a given file.
210
- fn get_size_in_blocks ( file : & dyn AsRef < Path > , metadata : & Metadata ) -> Result < u64 > {
211
- let name = file. as_ref ( ) ;
212
- let mut namelen = name. as_os_str ( ) . len ( ) ;
213
- let ftype = metadata. file_type ( ) ;
220
+ fn get_size_in_blocks (
221
+ file : & dyn AsRef < Path > ,
222
+ ino_db : & mut HashMap < u64 , PathBuf > ,
223
+ strip_prefix : bool ,
224
+ detect_hard_links : bool ,
225
+ ) -> Result < u64 > {
226
+ let file = file. as_ref ( ) ;
227
+ let mut namelen = file. as_os_str ( ) . len ( ) ;
214
228
let mut size_in_blocks = 1 ; // Header block
229
+ let metadata = file. metadata ( ) ?;
230
+ let ftype = metadata. file_type ( ) ;
231
+ if detect_hard_links {
232
+ let ino = metadata. ino ( ) ;
233
+ if ino_db. contains_key ( & ino) {
234
+ return Ok ( 1u64 ) ;
235
+ }
236
+ ino_db. insert ( ino, file. to_path_buf ( ) ) ;
237
+ }
238
+ if strip_prefix && file. to_string_lossy ( ) . starts_with ( "./" ) {
239
+ namelen -= 2 ;
240
+ }
215
241
if ftype. is_file ( ) {
216
242
let file_length = metadata. len ( ) ;
217
- size_in_blocks += pad_512_blocksize ( file_length) ;
243
+ size_in_blocks += pad_to_blocksize ( file_length) ;
218
244
} else if ftype. is_dir ( ) {
219
245
// Directory names must end with a slash.
220
- if !name . to_string_lossy ( ) . ends_with ( '/' ) {
246
+ if !file . to_string_lossy ( ) . ends_with ( '/' ) {
221
247
namelen += 1 ;
222
248
}
223
249
} else if ftype. is_symlink ( ) {
224
- // debug!("File {} is a symbolic link", name.display());
225
250
let link_tgt = read_link ( file) ?;
226
- // debug!("This symbol link is linked to {}", &link_tgt.display());
227
251
let link_tgt_len = link_tgt. as_os_str ( ) . len ( ) ;
228
252
if link_tgt_len > NAME_FIELD_SIZE {
229
253
// Here, if the link target has a long name, then there will be
230
254
// additional "file" that contains this long name. The name in
231
255
// its header will be "././@LongLink", and the file type is 'K'
232
256
// indicating that the next file will have a long link target.
233
257
// debug!("This link target exceeds 100 char limit!");
234
- size_in_blocks += 1 + pad_512_blocksize ( link_tgt_len as u64 + 1 ) ;
258
+ size_in_blocks += 1 + pad_to_blocksize ( link_tgt_len as u64 + 1 ) ;
235
259
}
236
260
} else if ftype. is_socket ( ) {
237
261
// info!("File {} is a socket, ignoring.", name.display());
238
262
// tar can't handle sockets.
239
- size_in_blocks = 0 ;
263
+ return Ok ( 0 ) ;
240
264
} else if ftype. is_block_device ( ) || ftype. is_char_device ( ) || ftype. is_fifo ( ) {
241
265
// Do nothing, as we've considered the long names, and they doesn't
242
266
// have "contents" to store - device major:minor numbers are stored
243
267
// in the header.
244
268
} else {
245
- // Unknown file type; skip;
246
- size_in_blocks = 0 ;
269
+ // Unknown file type, skip.
270
+ return Ok ( 0 )
247
271
}
248
272
// Additional blocks used to store the long name, this time it is a
249
273
// null-terminated string.
250
274
if namelen > NAME_FIELD_SIZE {
251
- // debug!("This file exceeds 100 char limit!");
252
- size_in_blocks += 1 + pad_512_blocksize ( namelen as u64 + 1 ) ;
275
+ size_in_blocks += 1 + pad_to_blocksize ( namelen as u64 + 1 ) ;
253
276
} ;
254
- size_in_blocks += get_xattrs_size ( file, metadata) ?;
255
277
// debug!("Reporting as {} blocks", size_in_blocks);
256
278
Ok ( size_in_blocks)
257
279
}
258
280
259
- pub fn get_tar_dir_size ( root : & Path ) -> Result < u64 > {
281
+ pub fn get_tar_dir_size (
282
+ root : & Path ,
283
+ strip_prefix : bool ,
284
+ hardlinks : bool ,
285
+ record_size : u64 ,
286
+ ) -> Result < u64 > {
287
+ if record_size < BLOCK_SIZE || record_size % BLOCK_SIZE != 0 {
288
+ bail ! ( "Record size must be a multiple of {}" , BLOCK_SIZE ) ;
289
+ }
260
290
// A hashmap with inode numbers as the key. Used to detect hard links.
261
291
// Since a hard link is a feature implemented in the filesystem, we
262
292
// can only rely on the inode number's uniqueness across a filesystem
@@ -277,34 +307,15 @@ pub fn get_tar_dir_size(root: &Path) -> Result<u64> {
277
307
for ent in walkdir. into_iter ( ) {
278
308
let ent = ent?;
279
309
let path = ent. path ( ) ;
280
- let metadata = ent. metadata ( ) ?;
281
- let ino = metadata. ino ( ) ;
282
- if ino_hashmap. contains_key ( & ino) {
283
- // info!(
284
- // "File {} is a hard link to {}. Reporting as 1 block in size.",
285
- // path.display(),
286
- // ino_hashmap
287
- // .get(&ino)
288
- // .expect("Unable to find the duplicate")
289
- // .display()
290
- // );
291
- total_size_in_blks += 1 ;
292
- continue ;
293
- }
294
- ino_hashmap. insert ( ino, path. to_path_buf ( ) ) ;
295
- total_size_in_blks += get_size_in_blocks ( & path, & metadata) ?;
310
+ total_size_in_blks += get_size_in_blocks ( & path, & mut ino_hashmap, strip_prefix, hardlinks) ?;
296
311
}
297
312
298
313
set_current_dir ( cwd) ?;
314
+ // GNU tar has 1024 bytes of zeros as the EOF marker.
299
315
let total_size_in_bytes = total_size_in_blks * BLOCK_SIZE + 1024 ;
300
- let padded_records = total_size_in_bytes. div_ceil ( RECORD_SIZE ) ;
301
- let padded = padded_records * RECORD_SIZE ;
302
- // println!(
303
- // "Total estimated tar size: {} bytes ({}) in {} records",
304
- // padded,
305
- // ByteSize::b(padded),
306
- // padded_records
307
- // );
316
+ // Pad the size to the record size.
317
+ let padded_records = total_size_in_bytes. div_ceil ( record_size) ;
318
+ let padded = padded_records * record_size;
308
319
309
320
Ok ( padded)
310
321
}
0 commit comments