Read a single file from an archive #271
From my research I found that zip files have a central directory at the end of the file that can be read to find where the individual files are located. Remote files on Zenodo can be partially read using the HTTP Range header.
This is another nice blog post about the process: https://www.djmannion.net/partial_zip/index.html
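For reference, the bytes fetched by the first Range request below form the end of central directory (EOCD) record, a fixed 22-byte structure as long as the archive has no trailing comment. A sketch of its layout, in the same field-size notation the code below uses for the other headers:
eocd_layout <- c(
  signature = 4,        # 0x50 0x4b 0x05 0x06
  disk_number = 2, cd_start_disk = 2,
  entries_on_disk = 2, total_entries = 2,
  cd_size = 4,          # read as bytes 13:16 below
  cd_offset = 4,        # read as bytes 17:20 below
  comment_length = 2
)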
I have played around a bit, and parsing a remote zip file is not that difficult: the following code reads data from a remote zip over HTTP. There is still plenty of room for improvement, but files can already be read within seconds.
require(httr2)
#> Loading required package: httr2
system.time({
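  # get_cd(): fetch the 22-byte end-of-central-directory record with a suffix
  # Range request, read the central directory size and offset from it, and then
  # fetch the central directory itself (plus the trailing EOCD)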
get_cd <- function(x = "https://zenodo.org/records/10671148/files/pilot2.zip") {
end <- request(x) |>
req_headers(Range = "bytes=-22") |>
req_perform() |>
purrr::chuck("body")
cd_start <- end[17:20] |>
rawToBits() |>
packBits("integer")
cd_len <- end[13:16] |>
rawToBits() |>
packBits("integer")
header <- request(x) |>
req_headers(Range = glue::glue("bytes={cd_start}-{cd_start+cd_len+22-1}")) |>
req_perform() |>
purrr::chuck("body")
return(header)
}
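  # raw2ToInt(): interpret a 2-byte little-endian raw vector as an integer by
  # padding it to 4 bytes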
raw2ToInt <- function(x) {
c(x, as.raw(0x00), as.raw(0x00)) |>
rawToBits() |>
packBits("integer")
}
dd <- get_cd()
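  # parsecd(): walk the central directory entries (signature 0x50 0x4b 0x01 0x02),
  # splitting each fixed-size header into named fields and collecting the
  # variable-length filename, extra field and comment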
parsecd <- function(x) {
deparse <- c(
signature = 4, version_made_by = 2, version_need_to_extract = 2, bit_flag = 2,
compression_method = 2, last_mod_time = 2, last_mod_date = 2, crc32 = 4, compressed_size = 4, uncompressed_size = 4,
filename_length = 2, extra_field_length = 2, file_comment_length = 2, disk_num = 2, int_file_attr = 2, ext_file_attr = 4, rel_offset = 4
)
deparse <- unlist(purrr::map2(names(deparse), deparse, ~ rep(.x, each = .y)))
res <- list()
while (all(head(x, 4) == as.raw(c(0x50, 0x4b, 0x01, 0x02)))) {
l <- split(head(x, length(deparse)), deparse)
x <- tail(x, -length(deparse))
filename_length_int <- raw2ToInt(l$filename_length)
extra_field_length_int <- raw2ToInt(l$extra_field_length)
file_comment_length_int <- raw2ToInt(l$file_comment_length)
l[["filename"]] <- head(x, filename_length_int)
x <- tail(x, -filename_length_int)
l[["extra_field"]] <- head(x, extra_field_length_int)
if (extra_field_length_int != 0) {
x <- tail(x, -extra_field_length_int)
}
l[["file_comment"]] <- head(x, file_comment_length_int)
if (file_comment_length_int != 0) {
x <- tail(x, -file_comment_length_int)
}
# res <- c(res, list(tibble::tibble( lapply(l, list))))
res <- c(res, list(structure(lapply(l, list), row.names = c(
NA,
-1L
), class = "data.frame")))
}
rr <- dplyr::bind_rows(res) |>
dplyr::mutate(
filename = purrr::map_chr(filename, rawToChar),
rel_offset = purrr::map_int(rel_offset, ~ packBits(rawToBits(.x), "integer")),
compressed_size = purrr::map_int(compressed_size, ~ packBits(rawToBits(.x), "integer")),
uncompressed_size = purrr::map_int(uncompressed_size, ~ packBits(rawToBits(.x), "integer"))
)
rr
}
rr <- parsecd(dd)
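  # pick the central directory entry for media.csv and use the next entry's
  # offset to bound the byte range of its local file header + compressed data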
rr |>
dplyr::mutate(next_rel_offset = dplyr::lead(rel_offset)) |>
dplyr::filter(grepl(pat = "media.csv", filename)) -> file
depcsv <- request("https://zenodo.org/records/10671148/files/pilot2.zip") |>
req_headers(Range = glue::glue("bytes={file$rel_offset}-{file$next_rel_offset-1}")) |>
req_perform() |>
purrr::chuck("body")
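  # the response starts with the local file header: a fixed 30-byte part
  # followed by the filename and extra field, after which the compressed data begins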
deparself <- c(
signature = 4, version_need_to_extract = 2, bit_flag = 2,
compression_method = 2, last_mod_time = 2, last_mod_date = 2,
crc32 = 4, compressed_size = 4, uncompressed_size = 4,
filename_length = 2, extra_field_length = 2
)
l <- list()
for (i in names(deparself)) {
l[[i]] <- head(depcsv, deparself[i])
depcsv <- tail(depcsv, -deparself[i])
}
filename_length_int <- raw2ToInt(l$filename_length)
extra_field_length_int <- raw2ToInt(l$extra_field_length)
l[["filename"]] <- head(depcsv, filename_length_int)
depcsv <- tail(depcsv, -filename_length_int)
l[["extra_field"]] <- head(depcsv, extra_field_length_int)
if (extra_field_length_int != 0) {
depcsv <- tail(depcsv, -extra_field_length_int)
}
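  # what remains is the raw deflate stream; prepend a zlib header (0x78 0x01)
  # so zip::inflate() can decompress it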
c(as.raw(0x78), as.raw(0x01), depcsv) |>
zip::inflate() |>
purrr::chuck("output") -> rawInflated
a <- vroom::vroom(rawConnection(rawInflated))
})
#> Rows: 365 Columns: 11
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (6): mediaID, deploymentID, captureMethod, filePath, fileName, fileMedi...
#> lgl (4): filePublic, exifData, favorite, mediaComments
#> dttm (1): timestamp
#>
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#> user system elapsed
#> 2.697 0.053 3.217
dplyr::glimpse(a)
#> Rows: 365
#> Columns: 11
#> $ mediaID <chr> "10b0e4da-ca2d-4026-8574-bff8d15a3dcb", "5974ba99-73ed-4…
#> $ deploymentID <chr> "AWD_1_13082021_pilot 46576a8c-019a-4dd8-852e-86380e0973…
#> $ captureMethod <chr> "activityDetection", "activityDetection", "activityDetec…
#> $ timestamp <dttm> 2021-08-14 00:35:58, 2021-08-14 00:35:59, 2021-08-14 00…
#> $ filePath <chr> "media\\AWD_1_13082021_pilot 46576a8c-019a-4dd8-852e-863…
#> $ filePublic <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TR…
#> $ fileName <chr> "10b0e4da-ca2d-4026-8574-bff8d15a3dcb.JPG", "5974ba99-73…
#> $ fileMediatype <chr> "image/jpeg", "image/jpeg", "image/jpeg", "image/jpeg", …
#> $ exifData <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
#> $ favorite <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
#> $ mediaComments <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
Note that this only supports classic zip files, not the larger zip64 format.
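If you need to guard against the zip64 case, a minimal sketch (assuming the archive has no trailing comment, and using the same httr2 calls as above) is to also fetch the 20 bytes just before the EOCD record and check for the zip64 end-of-central-directory locator signature:
is_zip64 <- function(x = "https://zenodo.org/records/10671148/files/pilot2.zip") {
  # last 42 bytes = 20-byte zip64 EOCD locator (if present) + 22-byte EOCD
  tail_bytes <- request(x) |>
    req_headers(Range = "bytes=-42") |>
    req_perform() |>
    purrr::chuck("body")
  all(head(tail_bytes, 4) == as.raw(c(0x50, 0x4b, 0x06, 0x07)))
}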
Bart sent us a message with an example where he was able to read a single events.csv from a 10 GB archive very quickly. However, two other files took longer. In Python he had better luck.