Skip to content

Commit 8dbfe8b

Browse files
committedMar 24, 2025
tar_dir_size: match the behavior of tar-rs; minor fixes
- tar-rs strips `./' prefix from path names. - tar-rs does not treat hard links specifically, they are regular files. - tar-rs does not use 10K record size. - move hard links' logic into the get_size_in_blocks(). - adjust the signature of get_tar_dir_size() accordingly.
1 parent 058538a commit 8dbfe8b

File tree

2 files changed

+72
-61
lines changed

2 files changed

+72
-61
lines changed
 

‎src/fs.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ pub fn archive_xz_tarball(
7272
let f = File::create(target)?;
7373
let xz = build_xz_encoder(threads)?;
7474

75-
let pb = create_progress_bar(get_tar_dir_size(root)?, no_progressbar)?;
75+
let pb = create_progress_bar(get_tar_dir_size(root, true, false, 512)?, no_progressbar)?;
7676

7777
let builder = build_tarball_stream(pb.wrap_write(XzEncoder::new_stream(f, xz)), root)?;
7878

@@ -87,7 +87,7 @@ pub fn archive_xz_tarball(
8787
pub fn archive_gz_tarball(root: &Path, target: &Path, no_progressbar: bool) -> Result<()> {
8888
let f = File::create(target)?;
8989

90-
let pb = create_progress_bar(get_tar_dir_size(root)?, no_progressbar)?;
90+
let pb = create_progress_bar(get_tar_dir_size(root, true, false, 512)?, no_progressbar)?;
9191

9292
let builder =
9393
build_tarball_stream(pb.wrap_write(GzEncoder::new(f, Compression::best())), root)?;

‎src/tar_dir_size.rs

+70-59
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
//! This module is used to accurately "estimate" the size of a tar archive
22
//! created from a given directory structure.
3-
//!
3+
//!
44
//! The tar format
55
//! ==============
66
//!
@@ -166,29 +166,45 @@
166166
//! -------
167167
//!
168168
//! GNU tar does not support sockets, and neither do the other implementations.
169+
//!
170+
//! Deviations from tar-rs
171+
//! ======================
172+
//!
173+
//! Some behaviors of `tar-rs` are not the same as GNU tar's, including:
174+
//!
175+
//! - The `./` prefix is stripped by default in `tar-rs`.
176+
//! - `tar-rs` does not treat hard links specially; they are archived as
//!   regular files.
177+
//! - `tar-rs` uses a blocking factor of 1 instead of GNU tar's default of 20,
//!   thus the entire
178+
//! archive is padded to 512-byte block.
169179
170-
use anyhow::{Context, Result};
180+
use anyhow::{Context, Result, bail};
171181
use std::{
172-
collections::HashMap, env::{current_dir, set_current_dir}, fs::{read_link, Metadata}, os::unix::fs::{FileTypeExt, MetadataExt}, path::{Path, PathBuf}
182+
collections::HashMap,
183+
env::{current_dir, set_current_dir},
184+
fs::read_link,
185+
os::unix::fs::{FileTypeExt, MetadataExt},
186+
path::{Path, PathBuf},
173187
};
174188
use walkdir::WalkDir;
175189

176190
/// The maximum filename length in the tar header.
177191
const NAME_FIELD_SIZE: usize = 100;
178192
/// The block size.
179193
const BLOCK_SIZE: u64 = 512;
180-
/// The record size is `BLOCKING_FACTOR * BLOCK_SIZE`.
181-
///
182-
/// The entire tar archive must pad to this size, since it is the unit size of
183-
/// one read/write operation from/to the medium, similar to the `bs=` option
184-
/// in dd(1) utility.
185-
///
186-
/// The default (compiled in) is 20, thus resulting a minimum 10KiB of tar file.
187-
const BLOCKING_FACTOR: u64 = 20;
188-
const RECORD_SIZE: u64 = BLOCKING_FACTOR * BLOCK_SIZE;
194+
// The record size is `BLOCKING_FACTOR * BLOCK_SIZE`.
195+
//
196+
// The entire tar archive must pad to this size, since it is the unit size of
197+
// one read/write operation from/to the medium, similar to the `bs=` option
198+
// in dd(1) utility.
199+
//
200+
// The default (compiled in) is 20, thus resulting in a minimum tar file size of 10 KiB.
201+
//
202+
// NOTE: The record size is now supplied by the caller via the `record_size`
// parameter of get_tar_dir_size().
203+
// const BLOCKING_FACTOR: u64 = 20;
204+
// const RECORD_SIZE: u64 = BLOCKING_FACTOR * BLOCK_SIZE;
189205

190206
/// Pad the given size to 512-byte sized blocks. Returns the number of blocks.
191-
fn pad_512_blocksize(size: u64) -> u64 {
207+
fn pad_to_blocksize(size: u64) -> u64 {
192208
let padded_bl = size.div_ceil(BLOCK_SIZE);
193209
let padded_size = padded_bl * BLOCK_SIZE;
194210
assert!(
@@ -200,63 +216,77 @@ fn pad_512_blocksize(size: u64) -> u64 {
200216
padded_bl
201217
}
202218

203-
/// Get the intended size for xattr of a given file.
204-
fn get_xattrs_size(_file: &dyn AsRef<Path>, _metadata: &Metadata) -> Result<u64> {
205-
// To be implemented
206-
Ok(0)
207-
}
208-
209219
/// Get the intended size occupied in the tar archive of a given file.
210-
fn get_size_in_blocks(file: &dyn AsRef<Path>, metadata: &Metadata) -> Result<u64> {
211-
let name = file.as_ref();
212-
let mut namelen = name.as_os_str().len();
213-
let ftype = metadata.file_type();
220+
fn get_size_in_blocks(
221+
file: &dyn AsRef<Path>,
222+
ino_db: &mut HashMap<u64, PathBuf>,
223+
strip_prefix: bool,
224+
detect_hard_links: bool,
225+
) -> Result<u64> {
226+
let file = file.as_ref();
227+
let mut namelen = file.as_os_str().len();
214228
let mut size_in_blocks = 1; // Header block
229+
let metadata = file.metadata()?;
230+
let ftype = metadata.file_type();
231+
if detect_hard_links {
232+
let ino = metadata.ino();
233+
if ino_db.contains_key(&ino) {
234+
return Ok(1u64);
235+
}
236+
ino_db.insert(ino, file.to_path_buf());
237+
}
238+
if strip_prefix && file.to_string_lossy().starts_with("./") {
239+
namelen -= 2;
240+
}
215241
if ftype.is_file() {
216242
let file_length = metadata.len();
217-
size_in_blocks += pad_512_blocksize(file_length);
243+
size_in_blocks += pad_to_blocksize(file_length);
218244
} else if ftype.is_dir() {
219245
// Directory names must end with a slash.
220-
if !name.to_string_lossy().ends_with('/') {
246+
if !file.to_string_lossy().ends_with('/') {
221247
namelen += 1;
222248
}
223249
} else if ftype.is_symlink() {
224-
// debug!("File {} is a symbolic link", name.display());
225250
let link_tgt = read_link(file)?;
226-
// debug!("This symbol link is linked to {}", &link_tgt.display());
227251
let link_tgt_len = link_tgt.as_os_str().len();
228252
if link_tgt_len > NAME_FIELD_SIZE {
229253
// Here, if the link target has a long name, then there will be
230254
// additional "file" that contains this long name. The name in
231255
// its header will be "././@LongLink", and the file type is 'K'
232256
// indicating that the next file will have a long link target.
233257
// debug!("This link target exceeds 100 char limit!");
234-
size_in_blocks += 1 + pad_512_blocksize(link_tgt_len as u64 + 1);
258+
size_in_blocks += 1 + pad_to_blocksize(link_tgt_len as u64 + 1);
235259
}
236260
} else if ftype.is_socket() {
237261
// info!("File {} is a socket, ignoring.", name.display());
238262
// tar can't handle sockets.
239-
size_in_blocks = 0;
263+
return Ok(0);
240264
} else if ftype.is_block_device() || ftype.is_char_device() || ftype.is_fifo() {
241265
// Do nothing, as we've considered the long names, and they doesn't
242266
// have "contents" to store - device major:minor numbers are stored
243267
// in the header.
244268
} else {
245-
// Unknown file type; skip;
246-
size_in_blocks = 0;
269+
// Unknown file type, skip.
270+
return Ok(0)
247271
}
248272
// Additional blocks used to store the long name, this time it is a
249273
// null-terminated string.
250274
if namelen > NAME_FIELD_SIZE {
251-
// debug!("This file exceeds 100 char limit!");
252-
size_in_blocks += 1 + pad_512_blocksize(namelen as u64 + 1);
275+
size_in_blocks += 1 + pad_to_blocksize(namelen as u64 + 1);
253276
};
254-
size_in_blocks += get_xattrs_size(file, metadata)?;
255277
// debug!("Reporting as {} blocks", size_in_blocks);
256278
Ok(size_in_blocks)
257279
}
258280

259-
pub fn get_tar_dir_size(root: &Path) -> Result<u64> {
281+
pub fn get_tar_dir_size(
282+
root: &Path,
283+
strip_prefix: bool,
284+
hardlinks: bool,
285+
record_size: u64,
286+
) -> Result<u64> {
287+
if record_size < BLOCK_SIZE || record_size % BLOCK_SIZE != 0 {
288+
bail!("Record size must be a multiple of {}", BLOCK_SIZE);
289+
}
260290
// A hashmap with inode numbers as the key. Used to detect hard links.
261291
// Since a hard link is a feature implemented in the filesystem, we
262292
// can only rely on the inode number's uniqueness across a filesystem
@@ -277,34 +307,15 @@ pub fn get_tar_dir_size(root: &Path) -> Result<u64> {
277307
for ent in walkdir.into_iter() {
278308
let ent = ent?;
279309
let path = ent.path();
280-
let metadata = ent.metadata()?;
281-
let ino = metadata.ino();
282-
if ino_hashmap.contains_key(&ino) {
283-
// info!(
284-
// "File {} is a hard link to {}. Reporting as 1 block in size.",
285-
// path.display(),
286-
// ino_hashmap
287-
// .get(&ino)
288-
// .expect("Unable to find the duplicate")
289-
// .display()
290-
// );
291-
total_size_in_blks += 1;
292-
continue;
293-
}
294-
ino_hashmap.insert(ino, path.to_path_buf());
295-
total_size_in_blks += get_size_in_blocks(&path, &metadata)?;
310+
total_size_in_blks += get_size_in_blocks(&path, &mut ino_hashmap, strip_prefix, hardlinks)?;
296311
}
297312

298313
set_current_dir(cwd)?;
314+
// GNU tar has 1024 bytes of zeros as the EOF marker.
299315
let total_size_in_bytes = total_size_in_blks * BLOCK_SIZE + 1024;
300-
let padded_records = total_size_in_bytes.div_ceil(RECORD_SIZE);
301-
let padded = padded_records * RECORD_SIZE;
302-
// println!(
303-
// "Total estimated tar size: {} bytes ({}) in {} records",
304-
// padded,
305-
// ByteSize::b(padded),
306-
// padded_records
307-
// );
316+
// Pad the size to the record size.
317+
let padded_records = total_size_in_bytes.div_ceil(record_size);
318+
let padded = padded_records * record_size;
308319

309320
Ok(padded)
310321
}

0 commit comments

Comments
 (0)