-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: completed bytes refactor with cleaned up functionality [2025-03…
…-02]
- Loading branch information
1 parent
c3ac361
commit d5b5a83
Showing
4 changed files
with
126 additions
and
231 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
use reqwest::blocking::get; | ||
use std::fs::File; | ||
use std::io::Read; | ||
use crate::processor::Processor; | ||
|
||
/// File formats that can be recognised from the leading bytes of a file.
///
/// The enum is a plain tag with no payload, so it is cheap to copy, compare,
/// and use as a hash-map or hash-set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FileType {
    /// Header starts with `%PDF`.
    PDF,
    /// Header starts with the 8-byte PNG signature.
    PNG,
    /// Header starts with `FF D8 FF`.
    JPEG,
    /// Header starts with `GIF8`.
    GIF,
    /// Header starts with the `PK\x03\x04` local-file-header signature and
    /// was not classified as a more specific ZIP-based format.
    ZIP,
    /// ZIP container whose next four header bytes are `14 00 06 00`.
    XLSX,
    /// ZIP container whose next four header bytes are `14 00 08 00`.
    DOCX,
    /// Header starts with `D0 CF 11 E0` (Microsoft Compound File Binary).
    /// Other formats sharing that container are reported as `XLS` too.
    XLS,
    /// Header starts with `PAR1`.
    PARQUET,
    /// No magic number matched and the sampled bytes look like
    /// comma-separated text.
    CSV,
    /// Nothing matched.
    Unknown,
}
|
||
/// The action a [`FileBytes`] value performs when it is processed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Operation {
    /// Report the detected file type and dump the first 16 header bytes as hex.
    Nibble,
}
|
||
pub struct FileBytes { | ||
bytes: [u8; 100], | ||
operation: Operation, | ||
} | ||
|
||
impl FileBytes { | ||
pub fn from_url(url: &str, operation: Operation) -> Result<Self, Box<dyn std::error::Error>> { | ||
let response = get(url)?; | ||
let mut buffer = [0u8; 100]; | ||
response.take(100).read(&mut buffer)?; | ||
Ok(FileBytes { bytes: buffer, operation }) | ||
} | ||
|
||
pub fn from_path(path: &str, operation: Operation) -> Result<Self, Box<dyn std::error::Error>> { | ||
let mut file = File::open(path)?; | ||
let mut buffer = [0u8; 100]; | ||
file.read(&mut buffer)?; | ||
Ok(FileBytes { bytes: buffer, operation }) | ||
} | ||
|
||
pub fn identify_type(&self) -> FileType { | ||
match &self.bytes { | ||
// PDF magic number | ||
[0x25, 0x50, 0x44, 0x46, ..] => FileType::PDF, | ||
// PNG magic number | ||
[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, ..] => FileType::PNG, | ||
// JPEG magic number | ||
[0xFF, 0xD8, 0xFF, ..] => FileType::JPEG, | ||
// GIF magic number | ||
[0x47, 0x49, 0x46, 0x38, ..] => FileType::GIF, | ||
// ZIP magic number (used for DOCX, XLSX, etc.) | ||
[0x50, 0x4B, 0x03, 0x04, rest @ ..] => match rest { | ||
[0x14, 0x00, 0x06, 0x00, ..] => FileType::XLSX, | ||
[0x14, 0x00, 0x08, 0x00, ..] => FileType::DOCX, | ||
_ => FileType::ZIP, | ||
}, | ||
// Parquet magic number (first 4 bytes: PAR1) | ||
[0x50, 0x41, 0x52, 0x31, ..] => FileType::PARQUET, | ||
// Microsoft Compound File Binary Format (used for XLS, older DOC, etc.) | ||
[0xD0, 0xCF, 0x11, 0xE0, ..] => FileType::XLS, | ||
// Attempt to detect CSV by checking if the first 100 bytes seem to be comma-separated values | ||
_ if self.bytes | ||
.iter() | ||
.all(|&b| b.is_ascii_alphanumeric() || b == b',' || b == b'\n' || b == b'\r') => | ||
{ | ||
FileType::CSV | ||
} | ||
// Default case for unknown file types | ||
_ => FileType::Unknown, | ||
} | ||
} | ||
|
||
pub fn get_bytes(&self) -> &[u8; 100] { | ||
&self.bytes | ||
} | ||
} | ||
|
||
impl Processor for FileBytes { | ||
fn process(&mut self) -> Result<(), Box<dyn std::error::Error>> { | ||
match &self.operation { | ||
Operation::Nibble => { | ||
let file_type = self.identify_type(); | ||
let bytes = self.get_bytes(); | ||
println!("File type: {:?}", file_type); | ||
println!("First 16 bytes (hex):"); | ||
for (i, byte) in bytes.iter().take(16).enumerate() { | ||
print!("{:02X} ", byte); | ||
if (i + 1) % 8 == 0 { | ||
println!(); | ||
} | ||
} | ||
Ok(()) | ||
} | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,88 +1,2 @@ | ||
use reqwest::blocking::get; | ||
use std::io::Read; | ||
|
||
/// File formats that can be recognised from the leading bytes of a file.
///
/// The enum is a plain tag with no payload, so it is cheap to copy, compare,
/// and use as a hash-map or hash-set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FileType {
    /// Header starts with `%PDF`.
    PDF,
    /// Header starts with the 8-byte PNG signature.
    PNG,
    /// Header starts with `FF D8 FF`.
    JPEG,
    /// Header starts with `GIF8`.
    GIF,
    /// Header starts with `PK\x03\x04` and was not classified as a more
    /// specific ZIP-based format.
    ZIP,
    /// ZIP container whose next four header bytes are `14 00 06 00`.
    XLSX,
    /// ZIP container whose next four header bytes are `14 00 08 00`.
    DOCX,
    /// Header starts with `D0 CF 11 E0` (Microsoft Compound File Binary).
    XLS,
    /// Header starts with `PAR1`.
    PARQUET,
    /// No magic number matched and the sampled bytes look like
    /// comma-separated text.
    CSV,
    /// Nothing matched.
    Unknown,
}
|
||
// Get bytes | ||
pub fn view_bytes(url: &str) -> Result<([u8; 100], FileType), Box<dyn std::error::Error>> { | ||
match get(url) { | ||
Ok(response) => { | ||
let mut buffer = [0u8; 100]; | ||
match response.take(100).read(&mut buffer) { | ||
Ok(bytes_read) => { | ||
if bytes_read < 100 { | ||
buffer[bytes_read..].fill(0); | ||
} | ||
let file_type = identify_file_type(&buffer); | ||
Ok((buffer, file_type)) | ||
} | ||
Err(e) => Err(Box::new(e)), | ||
} | ||
} | ||
Err(e) => Err(Box::new(e)), | ||
} | ||
} | ||
|
||
// Filetype logic | ||
fn identify_file_type(bytes: &[u8]) -> FileType { | ||
match bytes { | ||
// PDF magic number | ||
[0x25, 0x50, 0x44, 0x46, ..] => FileType::PDF, | ||
// PNG magic number | ||
[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, ..] => FileType::PNG, | ||
// JPEG magic number | ||
[0xFF, 0xD8, 0xFF, ..] => FileType::JPEG, | ||
// GIF magic number | ||
[0x47, 0x49, 0x46, 0x38, ..] => FileType::GIF, | ||
// ZIP magic number (used for DOCX, XLSX, etc.) | ||
[0x50, 0x4B, 0x03, 0x04, rest @ ..] => match rest { | ||
[0x14, 0x00, 0x06, 0x00, ..] => FileType::XLSX, | ||
[0x14, 0x00, 0x08, 0x00, ..] => FileType::DOCX, | ||
_ => FileType::ZIP, | ||
}, | ||
// Parquet magic number (first 4 bytes: PAR1) | ||
[0x50, 0x41, 0x52, 0x31, ..] => FileType::PARQUET, | ||
// Microsoft Compound File Binary Format (used for XLS, older DOC, etc.) | ||
[0xD0, 0xCF, 0x11, 0xE0, ..] => FileType::XLS, | ||
// Attempt to detect CSV by checking if the first 100 bytes seem to be comma-separated values | ||
_ if bytes | ||
.iter() | ||
.all(|&b| b.is_ascii_alphanumeric() || b == b',' || b == b'\n' || b == b'\r') => | ||
{ | ||
FileType::CSV | ||
} | ||
// Default case for unknown file types | ||
_ => FileType::Unknown, | ||
} | ||
} | ||
|
||
// Fields to match on | ||
pub fn get_file_type_string(file_type: &FileType) -> &'static str { | ||
match file_type { | ||
FileType::PDF => "PDF", | ||
FileType::PNG => "PNG", | ||
FileType::JPEG => "JPEG", | ||
FileType::GIF => "GIF", | ||
FileType::ZIP => "ZIP", | ||
FileType::XLSX => "Excel (XLSX)", | ||
FileType::DOCX => "Word (DOCX)", | ||
FileType::XLS => "Excel (XLS)", | ||
FileType::PARQUET => "Parquet", | ||
FileType::CSV => "CSV", | ||
FileType::Unknown => "Unknown", | ||
} | ||
} | ||
pub mod bytes; | ||
pub use bytes::*; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,131 +1,15 @@ | ||
use nebby::bytes::{view_bytes, FileType}; | ||
|
||
/// Smoke test: fetching a known-good endpoint yields a header without error.
#[test]
fn test_view_bytes() {
    const URL: &str = "https://api.carbonintensity.org.uk/regional/regionid/1";

    let fetched = view_bytes(URL);
    assert!(fetched.is_ok(), "Failed to read in bytes");
}
use nebby::bytes::FileBytes; | ||
use nebby::bytes::Operation; | ||
use nebby::bytes::FileType; | ||
|
||
/// A published PDF report must be detected as `FileType::PDF`.
#[test]
fn test_pdf_file() {
    const URL: &str = "https://data.london.gov.uk/download/lfb-financial-and-performance-reporting-2024-25/091d06f8-cfad-4286-9263-45351822bb50/LFB%20KPI%20report%20-%20data%20up%20to%202024.06%20%28July%20report%29%20V3.2%20FB.pdf";

    let fetched = view_bytes(URL);
    assert!(fetched.is_ok(), "Failed to read in bytes");

    // Only the detected type matters here; the raw header is discarded.
    let detected = fetched.unwrap().1;
    assert_eq!(detected, FileType::PDF, "Expected PDF file type");
}
|
||
// #[test] | ||
// fn test_png_file() { | ||
// let url = | ||
// "https://www.freepik.com/free-photo/draught-beer-png-mug_13299896.htm#query=png&position=0&from_view=keyword&track=ais_hybrid&uuid=49669f7d-f2c1-42b1-b2db-f28289d33b25"; | ||
|
||
// let result = view_bytes(url); | ||
|
||
// assert!(result.is_ok(), "Failed to read in bytes"); | ||
|
||
// let (_, file_type) = result.unwrap(); | ||
// assert_eq!(file_type, FileType::PNG, "Expected PNG file type"); | ||
// } | ||
|
||
// #[test] | ||
// fn test_jpeg_file() { | ||
// let url = "https://www.freepik.com/free-photo/transparent-colourful-autumn-leaves_5286206.htm#query=jpeg&position=20&from_view=keyword&track=ais_hybrid&uuid=169c2d48-7211-46d0-8097-8ee9cea5a8e4"; | ||
|
||
// let result = view_bytes(url); | ||
|
||
// assert!(result.is_ok(), "Failed to read in bytes"); | ||
|
||
// let (_, file_type) = result.unwrap(); | ||
// assert_eq!(file_type, FileType::JPEG, "Expected JPEG file type"); | ||
// } | ||
|
||
// #[test] | ||
// fn test_gif_file() { | ||
// let url = "https://www.w3.org/People/mimasa/test/imgformat/img/w3c_home.gif"; | ||
|
||
// let result = view_bytes(url); | ||
|
||
// assert!(result.is_ok(), "Failed to read in bytes"); | ||
|
||
// let (_, file_type) = result.unwrap(); | ||
// assert_eq!(file_type, FileType::GIF, "Expected GIF file type"); | ||
// } | ||
|
||
/// A published ZIP archive must be detected as `FileType::ZIP`.
#[test]
fn test_zip_file() {
    const URL: &str = "https://data.london.gov.uk/download/trend-based-population-projections/f042e4c5-9365-4d88-9a48-66a2158a2873/2021-based%20trend%20projections.zip";

    let fetched = view_bytes(URL);
    assert!(fetched.is_ok(), "Failed to read in bytes");

    // Only the detected type matters here; the raw header is discarded.
    let detected = fetched.unwrap().1;
    assert_eq!(detected, FileType::ZIP, "Expected ZIP file type");
}
|
||
// #[test] | ||
// fn test_xlsx_file() { | ||
// let url = "https://datamillnorth.org/download/2o13g/8n0/February%202025%20HMO%20public%20register.xlsx"; | ||
|
||
// let result = view_bytes(url); | ||
|
||
// assert!(result.is_ok(), "Failed to read in bytes"); | ||
|
||
// let (_, file_type) = result.unwrap(); | ||
// assert_eq!(file_type, FileType::XLSX, "Expected XLSX file type"); | ||
// } | ||
|
||
// #[test] | ||
// fn test_docx_file() { | ||
// let url = "https://data.london.gov.uk/download/kingston-upon-thames-reduction-and-recycling-plan/91f911b4-c0eb-4725-8640-a37eaf72c9d8/RBK%20RRP%20-%20July%202024%20update.docx"; | ||
|
||
// let result = view_bytes(url); | ||
|
||
// assert!(result.is_ok(), "Failed to read in bytes"); | ||
|
||
// let (_, file_type) = result.unwrap(); | ||
// assert_eq!(file_type, FileType::DOCX, "Expected DOCX file type"); | ||
// } | ||
|
||
#[test] | ||
fn test_xls_file() { | ||
let url = "https://data.london.gov.uk/download/diversity-london-report-data/4090b383-a418-4e2b-8592-75334244cbc7/diversity-in-london-data-2003.xls"; | ||
|
||
let result = view_bytes(url); | ||
let result = FileBytes::from_url(url, Operation::Nibble); | ||
|
||
assert!(result.is_ok(), "Failed to read in bytes"); | ||
|
||
let (_, file_type) = result.unwrap(); | ||
assert_eq!(file_type, FileType::XLS, "Expected XLS file type"); | ||
let file_bytes = result.unwrap(); | ||
assert_eq!(file_bytes.identify_type(), FileType::PDF, "Expected PDF file type"); | ||
} | ||
|
||
// #[test] | ||
// fn test_parquet_file() { | ||
// let url = "https://github.com/apache/parquet-testing/blob/master/data/alltypes_plain.parquet?raw=true"; | ||
|
||
// let result = view_bytes(url); | ||
|
||
// assert!(result.is_ok(), "Failed to read in bytes"); | ||
|
||
// let (_, file_type) = result.unwrap(); | ||
// assert_eq!(file_type, FileType::PARQUET, "Expected Parquet file type"); | ||
// } | ||
|
||
// #[test] | ||
// fn test_csv_file() { | ||
// let url = "https://data.london.gov.uk/download/mpsantisocialbehaviour/5ee9e479-f719-4788-a233-ec26a295805f/MPS_Antisocial_Behaviour.csv"; | ||
|
||
// let result = view_bytes(url); | ||
|
||
// assert!(result.is_ok(), "Failed to read in bytes"); | ||
|
||
// let (_, file_type) = result.unwrap(); | ||
// assert_eq!(file_type, FileType::CSV, "Expected CSV file type"); | ||
// } |