Skip to content

Commit

Permalink
feat: completed bytes refactor with cleaned up functionality [2025-03-02]
Browse files
Browse the repository at this point in the history
  • Loading branch information
CHRISCARLON committed Mar 2, 2025
1 parent c3ac361 commit d5b5a83
Show file tree
Hide file tree
Showing 4 changed files with 126 additions and 231 deletions.
101 changes: 101 additions & 0 deletions src/bytes/bytes.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
use reqwest::blocking::get;
use std::fs::File;
use std::io::Read;
use crate::processor::Processor;

// Define filetypes
/// File formats that can be reported for a file's leading bytes.
///
/// The enum is fieldless, so it is cheap to copy, compare and hash.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FileType {
    /// PDF document.
    PDF,
    /// PNG image.
    PNG,
    /// JPEG image.
    JPEG,
    /// GIF image.
    GIF,
    /// Generic ZIP archive.
    ZIP,
    /// ZIP-based Excel workbook.
    XLSX,
    /// ZIP-based Word document.
    DOCX,
    /// Legacy Excel workbook.
    XLS,
    /// Parquet file.
    PARQUET,
    /// Comma-separated text.
    CSV,
    /// None of the known formats matched.
    Unknown,
}

/// Operation to run on a set of captured file bytes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Operation {
    /// Report the detected file type and a hex preview of the leading bytes.
    Nibble,
}

/// Number of leading bytes captured from every source.
const HEADER_LEN: usize = 100;

/// The leading bytes of a local file or HTTP response body, together with
/// the operation that should be run on them.
pub struct FileBytes {
    /// Captured header window; positions at and beyond `len` are zero padding.
    bytes: [u8; HEADER_LEN],
    /// Number of bytes in `bytes` that were actually read from the source.
    len: usize,
    /// Operation associated with this value.
    operation: Operation,
}

impl FileBytes {
    /// Fetches `url` with a blocking GET and captures up to the first 100
    /// bytes of the response body.
    ///
    /// # Errors
    /// Returns an error if the request fails or the body cannot be read.
    pub fn from_url(url: &str, operation: Operation) -> Result<Self, Box<dyn std::error::Error>> {
        let response = get(url)?;
        let (bytes, len) = Self::read_header(response)?;
        Ok(FileBytes { bytes, len, operation })
    }

    /// Opens the file at `path` and captures up to its first 100 bytes.
    ///
    /// # Errors
    /// Returns an error if the file cannot be opened or read.
    pub fn from_path(path: &str, operation: Operation) -> Result<Self, Box<dyn std::error::Error>> {
        let file = File::open(path)?;
        let (bytes, len) = Self::read_header(file)?;
        Ok(FileBytes { bytes, len, operation })
    }

    /// Reads from `source` until the header buffer is full or the source is
    /// exhausted, returning the zero-padded buffer and the number of bytes read.
    ///
    /// A single `read` call is allowed to return fewer bytes than requested
    /// (common for network streams), so the call is repeated instead of
    /// trusting the first result.
    fn read_header<R: Read>(mut source: R) -> std::io::Result<([u8; HEADER_LEN], usize)> {
        let mut buffer = [0u8; HEADER_LEN];
        let mut filled = 0;
        while filled < buffer.len() {
            match source.read(&mut buffer[filled..]) {
                // `Ok(0)` signals end of input: the source is shorter than the window.
                Ok(0) => break,
                Ok(read) => filled += read,
                // An interrupted read is transient; retry it.
                Err(err) if err.kind() == std::io::ErrorKind::Interrupted => continue,
                Err(err) => return Err(err),
            }
        }
        Ok((buffer, filled))
    }

    /// Guesses the file type from the captured bytes.
    ///
    /// Only bytes that were actually read are inspected, so zero padding can
    /// neither complete a magic number nor spoil the CSV heuristic.
    pub fn identify_type(&self) -> FileType {
        let header = &self.bytes[..self.len];
        match header {
            // PDF magic number ("%PDF")
            [0x25, 0x50, 0x44, 0x46, ..] => FileType::PDF,
            // PNG magic number
            [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, ..] => FileType::PNG,
            // JPEG magic number
            [0xFF, 0xD8, 0xFF, ..] => FileType::JPEG,
            // GIF magic number ("GIF8")
            [0x47, 0x49, 0x46, 0x38, ..] => FileType::GIF,
            // ZIP magic number (used for DOCX, XLSX, etc.)
            // NOTE(review): the four bytes after the signature are generic ZIP
            // header fields (see the ZIP format specification), so the
            // XLSX/DOCX split below is a best-effort guess — confirm against
            // real samples.
            [0x50, 0x4B, 0x03, 0x04, rest @ ..] => match rest {
                [0x14, 0x00, 0x06, 0x00, ..] => FileType::XLSX,
                [0x14, 0x00, 0x08, 0x00, ..] => FileType::DOCX,
                _ => FileType::ZIP,
            },
            // Parquet magic number (first 4 bytes: PAR1)
            [0x50, 0x41, 0x52, 0x31, ..] => FileType::PARQUET,
            // Microsoft Compound File Binary Format (used for XLS, older DOC, etc.)
            [0xD0, 0xCF, 0x11, 0xE0, ..] => FileType::XLS,
            // No magic number matched: fall back to the CSV heuristic.
            _ if Self::looks_like_csv(header) => FileType::CSV,
            // Default case for unknown file types
            _ => FileType::Unknown,
        }
    }

    /// Returns `true` when `content` is non-empty and consists solely of ASCII
    /// alphanumerics, commas, and CR/LF line breaks.
    fn looks_like_csv(content: &[u8]) -> bool {
        // `all` is vacuously true on an empty slice, so rule that out first.
        !content.is_empty()
            && content
                .iter()
                .all(|&b| b.is_ascii_alphanumeric() || matches!(b, b',' | b'\n' | b'\r'))
    }

    /// Returns the full 100-byte header window, zero-padded when the source
    /// held fewer than 100 bytes.
    pub fn get_bytes(&self) -> &[u8; 100] {
        &self.bytes
    }
}

// Runs the operation stored on a `FileBytes` value.
impl Processor for FileBytes {
    /// Executes the configured operation.
    ///
    /// For `Operation::Nibble` this prints the detected file type followed by
    /// the first 16 captured bytes as hex, eight bytes per line.
    fn process(&mut self) -> Result<(), Box<dyn std::error::Error>> {
        match self.operation {
            Operation::Nibble => {
                let detected = self.identify_type();
                let preview = &self.get_bytes()[..16];

                println!("File type: {:?}", detected);
                println!("First 16 bytes (hex):");
                // Two rows of eight bytes, each row terminated by a newline.
                for row in preview.chunks(8) {
                    for value in row {
                        print!("{:02X} ", value);
                    }
                    println!();
                }
                Ok(())
            }
        }
    }
}
90 changes: 2 additions & 88 deletions src/bytes/mod.rs
Original file line number Diff line number Diff line change
@@ -1,88 +1,2 @@
use reqwest::blocking::get;
use std::io::Read;

// Define filetypes
/// File formats that can be reported for a file's leading bytes.
///
/// The enum is fieldless, so it is cheap to copy, compare and hash.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FileType {
    /// PDF document.
    PDF,
    /// PNG image.
    PNG,
    /// JPEG image.
    JPEG,
    /// GIF image.
    GIF,
    /// Generic ZIP archive.
    ZIP,
    /// ZIP-based Excel workbook.
    XLSX,
    /// ZIP-based Word document.
    DOCX,
    /// Legacy Excel workbook.
    XLS,
    /// Parquet file.
    PARQUET,
    /// Comma-separated text.
    CSV,
    /// None of the known formats matched.
    Unknown,
}

// Get bytes
/// Fetches `url` with a blocking GET and returns up to the first 100 bytes of
/// the response body (zero-padded when the body is shorter) together with the
/// file type detected from them.
///
/// # Errors
/// Returns an error if the request fails or the body cannot be read.
pub fn view_bytes(url: &str) -> Result<([u8; 100], FileType), Box<dyn std::error::Error>> {
    let mut response = get(url)?;
    let mut buffer = [0u8; 100];
    let mut filled = 0;

    // A single `read` may return fewer bytes than requested (common for
    // network streams), so keep reading until the buffer is full or the body
    // ends. The buffer starts zeroed, so any unread tail is already padding.
    while filled < buffer.len() {
        match response.read(&mut buffer[filled..]) {
            // `Ok(0)` signals end of body.
            Ok(0) => break,
            Ok(read) => filled += read,
            // An interrupted read is transient; retry it.
            Err(err) if err.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(err) => return Err(err.into()),
        }
    }

    let file_type = identify_file_type(&buffer);
    Ok((buffer, file_type))
}

// Filetype logic
/// Guesses a file type from the leading bytes of a file.
///
/// Known magic numbers are checked first. If none matches, the input is
/// reported as CSV when its content (ignoring trailing zero padding) is
/// non-empty and made up only of ASCII alphanumerics, commas and CR/LF.
fn identify_file_type(bytes: &[u8]) -> FileType {
    let looks_like_csv = || {
        // Callers pass a zero-padded buffer; a NUL byte is never accepted by
        // the CSV check, so trailing NULs are treated as padding, not content.
        let content_len = bytes
            .iter()
            .rposition(|&b| b != 0)
            .map_or(0, |last| last + 1);
        let content = &bytes[..content_len];
        // `all` is vacuously true on an empty slice, so rule that out first.
        !content.is_empty()
            && content
                .iter()
                .all(|&b| b.is_ascii_alphanumeric() || matches!(b, b',' | b'\n' | b'\r'))
    };

    match bytes {
        // PDF magic number ("%PDF")
        [0x25, 0x50, 0x44, 0x46, ..] => FileType::PDF,
        // PNG magic number
        [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, ..] => FileType::PNG,
        // JPEG magic number
        [0xFF, 0xD8, 0xFF, ..] => FileType::JPEG,
        // GIF magic number ("GIF8")
        [0x47, 0x49, 0x46, 0x38, ..] => FileType::GIF,
        // ZIP magic number (used for DOCX, XLSX, etc.)
        // NOTE(review): the four bytes after the signature are generic ZIP
        // header fields (see the ZIP format specification), so the XLSX/DOCX
        // split below is a best-effort guess — confirm against real samples.
        [0x50, 0x4B, 0x03, 0x04, rest @ ..] => match rest {
            [0x14, 0x00, 0x06, 0x00, ..] => FileType::XLSX,
            [0x14, 0x00, 0x08, 0x00, ..] => FileType::DOCX,
            _ => FileType::ZIP,
        },
        // Parquet magic number (first 4 bytes: PAR1)
        [0x50, 0x41, 0x52, 0x31, ..] => FileType::PARQUET,
        // Microsoft Compound File Binary Format (used for XLS, older DOC, etc.)
        [0xD0, 0xCF, 0x11, 0xE0, ..] => FileType::XLS,
        // No magic number matched: fall back to the CSV heuristic.
        _ if looks_like_csv() => FileType::CSV,
        // Default case for unknown file types
        _ => FileType::Unknown,
    }
}

// Human-readable labels
/// Returns the display label for `file_type`.
pub fn get_file_type_string(file_type: &FileType) -> &'static str {
    let label = match *file_type {
        // Documents
        FileType::PDF => "PDF",
        FileType::DOCX => "Word (DOCX)",
        // Spreadsheets and tabular data
        FileType::XLSX => "Excel (XLSX)",
        FileType::XLS => "Excel (XLS)",
        FileType::CSV => "CSV",
        FileType::PARQUET => "Parquet",
        // Images
        FileType::PNG => "PNG",
        FileType::JPEG => "JPEG",
        FileType::GIF => "GIF",
        // Archives and everything else
        FileType::ZIP => "ZIP",
        FileType::Unknown => "Unknown",
    };
    label
}
pub mod bytes;
pub use bytes::*;
38 changes: 17 additions & 21 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ mod utils;
mod parquet;
mod processor;
use json::JsonResponse;
use bytes::{get_file_type_string, view_bytes};
use bytes::FileBytes;
use clap::{Parser, Subcommand};
use csv::{fetch_remote_csv, process_basic_csv};
use delta_lake::{get_aws_config, load_remote_delta_lake_table_info};
Expand Down Expand Up @@ -70,7 +70,10 @@ enum Commands {
/// Check bytes of any file
Nibble {
/// URL of the file, or a local file path when `--local` is set
url: String,
path: String,
/// Treat `path` as a local file path instead of a URL
#[arg(short, long, help = "If true, the file will be processed locally")]
local: bool,
},
/// Basic CSV feature
BasicCsv { url: String },
Expand Down Expand Up @@ -143,7 +146,18 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
pb.finish_with_message("JSON Processed");
result
},
Commands::Nibble { url } => process_view_bytes(url),
Commands::Nibble { path, local } => {
let pb = create_progress_bar("Processing Nibble...");
let result = match local {
true => FileBytes::from_path(path, bytes::Operation::Nibble)?.process(),
false => {
validate_url(path)?;
FileBytes::from_url(path, bytes::Operation::Nibble)?.process()
}
};
pb.finish_with_message("Nibble Processed");
result
},
Commands::BasicCsv { url } => process_csv(url),
Commands::DeltaLake { s3_uri } => process_delta_lake(s3_uri).await,
Commands::BasicParquet { path } => {
Expand All @@ -161,24 +175,6 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
// TODO: add local flag to all commands that could support it
// TODO: combine all the process_ functions into one

/// Validates `url`, downloads its first 100 bytes, and prints them as hex
/// (16 bytes per line) followed by the detected file type.
fn process_view_bytes(url: &str) -> Result<(), Box<dyn std::error::Error>> {
    validate_url(url)?;

    let spinner = create_progress_bar("Viewing first 100 bytes...");
    let (header, detected) = view_bytes(url)?;
    spinner.finish_and_clear();

    println!("First 100 bytes:");
    for row in header.chunks(16) {
        for value in row {
            print!("{:02X} ", value);
        }
        // Only complete rows end with a line break; the short final row is
        // terminated by the leading "\n" of the summary line below.
        if row.len() == 16 {
            println!();
        }
    }

    let label = get_file_type_string(&detected);
    println!("\nDetected file type: {}", label);
    Ok(())
}

fn process_csv(url: &str) -> Result<(), Box<dyn std::error::Error>> {
validate_url(url)?;

Expand Down
128 changes: 6 additions & 122 deletions tests/bytes_tests.rs
Original file line number Diff line number Diff line change
@@ -1,131 +1,15 @@
use nebby::bytes::{view_bytes, FileType};

// Smoke test: fetching the leading bytes of a live HTTP endpoint succeeds.
// Requires network access; only the `Ok` status of the result is checked.
#[test]
fn test_view_bytes() {
let url = "https://api.carbonintensity.org.uk/regional/regionid/1";

let result = view_bytes(url);

assert!(result.is_ok(), "Failed to read in bytes");
}
use nebby::bytes::FileBytes;
use nebby::bytes::Operation;
use nebby::bytes::FileType;

// Downloads the leading bytes of a remote PDF and checks that the detected
// type is `FileType::PDF`. Requires network access.
#[test]
fn test_pdf_file() {
let url = "https://data.london.gov.uk/download/lfb-financial-and-performance-reporting-2024-25/091d06f8-cfad-4286-9263-45351822bb50/LFB%20KPI%20report%20-%20data%20up%20to%202024.06%20%28July%20report%29%20V3.2%20FB.pdf";

let result = view_bytes(url);

assert!(result.is_ok(), "Failed to read in bytes");

// The returned byte buffer is ignored; only the detected type is asserted.
let (_, file_type) = result.unwrap();
assert_eq!(file_type, FileType::PDF, "Expected PDF file type");
}

// #[test]
// fn test_png_file() {
// let url =
// "https://www.freepik.com/free-photo/draught-beer-png-mug_13299896.htm#query=png&position=0&from_view=keyword&track=ais_hybrid&uuid=49669f7d-f2c1-42b1-b2db-f28289d33b25";

// let result = view_bytes(url);

// assert!(result.is_ok(), "Failed to read in bytes");

// let (_, file_type) = result.unwrap();
// assert_eq!(file_type, FileType::PNG, "Expected PNG file type");
// }

// #[test]
// fn test_jpeg_file() {
// let url = "https://www.freepik.com/free-photo/transparent-colourful-autumn-leaves_5286206.htm#query=jpeg&position=20&from_view=keyword&track=ais_hybrid&uuid=169c2d48-7211-46d0-8097-8ee9cea5a8e4";

// let result = view_bytes(url);

// assert!(result.is_ok(), "Failed to read in bytes");

// let (_, file_type) = result.unwrap();
// assert_eq!(file_type, FileType::JPEG, "Expected JPEG file type");
// }

// #[test]
// fn test_gif_file() {
// let url = "https://www.w3.org/People/mimasa/test/imgformat/img/w3c_home.gif";

// let result = view_bytes(url);

// assert!(result.is_ok(), "Failed to read in bytes");

// let (_, file_type) = result.unwrap();
// assert_eq!(file_type, FileType::GIF, "Expected GIF file type");
// }

// Downloads the leading bytes of a remote ZIP archive and checks that the
// detected type is `FileType::ZIP`. Requires network access.
#[test]
fn test_zip_file() {
let url = "https://data.london.gov.uk/download/trend-based-population-projections/f042e4c5-9365-4d88-9a48-66a2158a2873/2021-based%20trend%20projections.zip";

let result = view_bytes(url);

assert!(result.is_ok(), "Failed to read in bytes");

// The returned byte buffer is ignored; only the detected type is asserted.
let (_, file_type) = result.unwrap();
assert_eq!(file_type, FileType::ZIP, "Expected ZIP file type");
}

// #[test]
// fn test_xlsx_file() {
// let url = "https://datamillnorth.org/download/2o13g/8n0/February%202025%20HMO%20public%20register.xlsx";

// let result = view_bytes(url);

// assert!(result.is_ok(), "Failed to read in bytes");

// let (_, file_type) = result.unwrap();
// assert_eq!(file_type, FileType::XLSX, "Expected XLSX file type");
// }

// #[test]
// fn test_docx_file() {
// let url = "https://data.london.gov.uk/download/kingston-upon-thames-reduction-and-recycling-plan/91f911b4-c0eb-4725-8640-a37eaf72c9d8/RBK%20RRP%20-%20July%202024%20update.docx";

// let result = view_bytes(url);

// assert!(result.is_ok(), "Failed to read in bytes");

// let (_, file_type) = result.unwrap();
// assert_eq!(file_type, FileType::DOCX, "Expected DOCX file type");
// }

// NOTE(review): this span looks like a diff view that interleaves removed and
// added lines. It requests an `.xls` URL, binds `result` twice, unwraps it
// twice, and asserts both `FileType::XLS` and `FileType::PDF` — confirm the
// intended test body against the actual file before relying on it.
#[test]
fn test_xls_file() {
let url = "https://data.london.gov.uk/download/diversity-london-report-data/4090b383-a418-4e2b-8592-75334244cbc7/diversity-in-london-data-2003.xls";

let result = view_bytes(url);
let result = FileBytes::from_url(url, Operation::Nibble);

assert!(result.is_ok(), "Failed to read in bytes");

let (_, file_type) = result.unwrap();
assert_eq!(file_type, FileType::XLS, "Expected XLS file type");
let file_bytes = result.unwrap();
assert_eq!(file_bytes.identify_type(), FileType::PDF, "Expected PDF file type");
}

// #[test]
// fn test_parquet_file() {
// let url = "https://github.com/apache/parquet-testing/blob/master/data/alltypes_plain.parquet?raw=true";

// let result = view_bytes(url);

// assert!(result.is_ok(), "Failed to read in bytes");

// let (_, file_type) = result.unwrap();
// assert_eq!(file_type, FileType::PARQUET, "Expected Parquet file type");
// }

// #[test]
// fn test_csv_file() {
// let url = "https://data.london.gov.uk/download/mpsantisocialbehaviour/5ee9e479-f719-4788-a233-ec26a295805f/MPS_Antisocial_Behaviour.csv";

// let result = view_bytes(url);

// assert!(result.is_ok(), "Failed to read in bytes");

// let (_, file_type) = result.unwrap();
// assert_eq!(file_type, FileType::CSV, "Expected CSV file type");
// }

0 comments on commit d5b5a83

Please sign in to comment.