feat: completed bytes refactor with cleaned up functionality [2025-03-02]
CHRISCARLON · Mar 2, 2025 · d5b5a83 · d5b5a83
1 parent c3ac361
commit d5b5a83
Show file tree

Hide file tree

Showing 4 changed files with 126 additions and 231 deletions.
diff --git a/src/bytes/bytes.rs b/src/bytes/bytes.rs
@@ -0,0 +1,101 @@
+use reqwest::blocking::get;
+use std::fs::File;
+use std::io::Read;
+use crate::processor::Processor;
+
+// Define filetypes
+#[derive(Debug, PartialEq)]
+pub enum FileType {
+    PDF,
+    PNG,
+    JPEG,
+    GIF,
+    ZIP,
+    XLSX,
+    DOCX,
+    XLS,
+    PARQUET,
+    CSV,
+    Unknown,
+}
+
+pub enum Operation {
+    Nibble
+}
+
+pub struct FileBytes {
+    bytes: [u8; 100],
+    operation: Operation,
+}
+
+impl FileBytes {    
+    pub fn from_url(url: &str, operation: Operation) -> Result<Self, Box<dyn std::error::Error>> {
+        let response = get(url)?;
+        let mut buffer = [0u8; 100];
+        response.take(100).read(&mut buffer)?;
+        Ok(FileBytes { bytes: buffer, operation })
+    }
+
+    pub fn from_path(path: &str, operation: Operation) -> Result<Self, Box<dyn std::error::Error>> {
+        let mut file = File::open(path)?;
+        let mut buffer = [0u8; 100];
+        file.read(&mut buffer)?;
+        Ok(FileBytes { bytes: buffer, operation })
+    }
+
+    pub fn identify_type(&self) -> FileType {
+        match &self.bytes {
+            // PDF magic number
+            [0x25, 0x50, 0x44, 0x46, ..] => FileType::PDF,
+            // PNG magic number
+            [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, ..] => FileType::PNG,
+            // JPEG magic number
+            [0xFF, 0xD8, 0xFF, ..] => FileType::JPEG,
+            // GIF magic number
+            [0x47, 0x49, 0x46, 0x38, ..] => FileType::GIF,
+            // ZIP magic number (used for DOCX, XLSX, etc.)
+            [0x50, 0x4B, 0x03, 0x04, rest @ ..] => match rest {
+                [0x14, 0x00, 0x06, 0x00, ..] => FileType::XLSX,
+                [0x14, 0x00, 0x08, 0x00, ..] => FileType::DOCX,
+                _ => FileType::ZIP,
+            },
+            // Parquet magic number (first 4 bytes: PAR1)
+            [0x50, 0x41, 0x52, 0x31, ..] => FileType::PARQUET,
+            // Microsoft Compound File Binary Format (used for XLS, older DOC, etc.)
+            [0xD0, 0xCF, 0x11, 0xE0, ..] => FileType::XLS,
+            // Attempt to detect CSV by checking if the first 100 bytes seem to be comma-separated values
+            _ if self.bytes
+                .iter()
+                .all(|&b| b.is_ascii_alphanumeric() || b == b',' || b == b'\n' || b == b'\r') =>
+            {
+                FileType::CSV
+            }
+            // Default case for unknown file types
+            _ => FileType::Unknown,
+        }
+    }
+
+    pub fn get_bytes(&self) -> &[u8; 100] {
+        &self.bytes
+    }
+}
+
+impl Processor for FileBytes {
+    fn process(&mut self) -> Result<(), Box<dyn std::error::Error>> {
+        match &self.operation {
+            Operation::Nibble => {
+                let file_type = self.identify_type();
+                let bytes = self.get_bytes();
+                println!("File type: {:?}", file_type);
+                println!("First 16 bytes (hex):");
+                for (i, byte) in bytes.iter().take(16).enumerate() {
+                    print!("{:02X} ", byte);
+                    if (i + 1) % 8 == 0 {
+                        println!();
+                    }
+                }
+                Ok(())
+            }
+        }
+    }
+}
diff --git a/src/bytes/mod.rs b/src/bytes/mod.rs
@@ -1,88 +1,2 @@
-use reqwest::blocking::get;
-use std::io::Read;
-
-// Define filetypes
-#[derive(Debug, PartialEq)]
-pub enum FileType {
-    PDF,
-    PNG,
-    JPEG,
-    GIF,
-    ZIP,
-    XLSX,
-    DOCX,
-    XLS,
-    PARQUET,
-    CSV,
-    Unknown,
-}
-
-// Get bytes
-pub fn view_bytes(url: &str) -> Result<([u8; 100], FileType), Box<dyn std::error::Error>> {
-    match get(url) {
-        Ok(response) => {
-            let mut buffer = [0u8; 100];
-            match response.take(100).read(&mut buffer) {
-                Ok(bytes_read) => {
-                    if bytes_read < 100 {
-                        buffer[bytes_read..].fill(0);
-                    }
-                    let file_type = identify_file_type(&buffer);
-                    Ok((buffer, file_type))
-                }
-                Err(e) => Err(Box::new(e)),
-            }
-        }
-        Err(e) => Err(Box::new(e)),
-    }
-}
-
-// Filetype logic
-fn identify_file_type(bytes: &[u8]) -> FileType {
-    match bytes {
-        // PDF magic number
-        [0x25, 0x50, 0x44, 0x46, ..] => FileType::PDF,
-        // PNG magic number
-        [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, ..] => FileType::PNG,
-        // JPEG magic number
-        [0xFF, 0xD8, 0xFF, ..] => FileType::JPEG,
-        // GIF magic number
-        [0x47, 0x49, 0x46, 0x38, ..] => FileType::GIF,
-        // ZIP magic number (used for DOCX, XLSX, etc.)
-        [0x50, 0x4B, 0x03, 0x04, rest @ ..] => match rest {
-            [0x14, 0x00, 0x06, 0x00, ..] => FileType::XLSX,
-            [0x14, 0x00, 0x08, 0x00, ..] => FileType::DOCX,
-            _ => FileType::ZIP,
-        },
-        // Parquet magic number (first 4 bytes: PAR1)
-        [0x50, 0x41, 0x52, 0x31, ..] => FileType::PARQUET,
-        // Microsoft Compound File Binary Format (used for XLS, older DOC, etc.)
-        [0xD0, 0xCF, 0x11, 0xE0, ..] => FileType::XLS,
-        // Attempt to detect CSV by checking if the first 100 bytes seem to be comma-separated values
-        _ if bytes
-            .iter()
-            .all(|&b| b.is_ascii_alphanumeric() || b == b',' || b == b'\n' || b == b'\r') =>
-        {
-            FileType::CSV
-        }
-        // Default case for unknown file types
-        _ => FileType::Unknown,
-    }
-}
-
-// Fields to match on
-pub fn get_file_type_string(file_type: &FileType) -> &'static str {
-    match file_type {
-        FileType::PDF => "PDF",
-        FileType::PNG => "PNG",
-        FileType::JPEG => "JPEG",
-        FileType::GIF => "GIF",
-        FileType::ZIP => "ZIP",
-        FileType::XLSX => "Excel (XLSX)",
-        FileType::DOCX => "Word (DOCX)",
-        FileType::XLS => "Excel (XLS)",
-        FileType::PARQUET => "Parquet",
-        FileType::CSV => "CSV",
-        FileType::Unknown => "Unknown",
-    }
-}
+pub mod bytes;
+pub use bytes::*;
diff --git a/src/main.rs b/src/main.rs
@@ -7,7 +7,7 @@ mod utils;
 mod parquet;
 mod processor;
 use json::JsonResponse;
-use bytes::{get_file_type_string, view_bytes};
+use bytes::FileBytes;
 use clap::{Parser, Subcommand};
 use csv::{fetch_remote_csv, process_basic_csv};
 use delta_lake::{get_aws_config, load_remote_delta_lake_table_info};
@@ -70,7 +70,10 @@ enum Commands {
     /// Check bytes of any file
     Nibble {
         /// Url of the file
-        url: String,
+        path: String,
+        /// Treat `path` as a local file path instead of a URL
+        #[arg(short, long, help = "If true, the file will be processed locally")]
+        local: bool,
     },
     /// Basic CSV feature
     BasicCsv { url: String },
@@ -143,7 +146,18 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
             pb.finish_with_message("JSON Processed");
             result
         },
-        Commands::Nibble { url } => process_view_bytes(url),
+        Commands::Nibble { path, local  } => {
+            let pb = create_progress_bar("Processing Nibble...");
+            let result = match local {
+                true => FileBytes::from_path(path, bytes::Operation::Nibble)?.process(),
+                false => {
+                    validate_url(path)?;
+                    FileBytes::from_url(path, bytes::Operation::Nibble)?.process()
+                }
+            };
+            pb.finish_with_message("Nibble Processed");
+            result
+        },
         Commands::BasicCsv { url } => process_csv(url),
         Commands::DeltaLake { s3_uri } => process_delta_lake(s3_uri).await,
         Commands::BasicParquet { path } => {
@@ -161,24 +175,6 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
 // TODO: add local flag to all commands that could support it
 // TODO: combine all the process_ functions into one
 
-fn process_view_bytes(url: &str) -> Result<(), Box<dyn std::error::Error>> {
-    validate_url(url)?;
-
-    let pb = create_progress_bar("Viewing first 100 bytes...");
-    let (bytes, file_type) = view_bytes(url)?;
-    pb.finish_and_clear();
-
-    println!("First 100 bytes:");
-    for (i, byte) in bytes.iter().enumerate() {
-        print!("{:02X} ", byte);
-        if (i + 1) % 16 == 0 {
-            println!();
-        }
-    }
-    println!("\nDetected file type: {}", get_file_type_string(&file_type));
-    Ok(())
-}
-
 fn process_csv(url: &str) -> Result<(), Box<dyn std::error::Error>> {
     validate_url(url)?;
 

diff --git a/tests/bytes_tests.rs b/tests/bytes_tests.rs
@@ -1,131 +1,15 @@
-use nebby::bytes::{view_bytes, FileType};
-
-#[test]
-fn test_view_bytes() {
-    let url = "https://api.carbonintensity.org.uk/regional/regionid/1";
-
-    let result = view_bytes(url);
-
-    assert!(result.is_ok(), "Failed to read in bytes");
-}
+use nebby::bytes::FileBytes;
+use nebby::bytes::Operation;
+use nebby::bytes::FileType;
 
 #[test]
 fn test_pdf_file() {
     let url = "https://data.london.gov.uk/download/lfb-financial-and-performance-reporting-2024-25/091d06f8-cfad-4286-9263-45351822bb50/LFB%20KPI%20report%20-%20data%20up%20to%202024.06%20%28July%20report%29%20V3.2%20FB.pdf";
 
-    let result = view_bytes(url);
-
-    assert!(result.is_ok(), "Failed to read in bytes");
-
-    let (_, file_type) = result.unwrap();
-    assert_eq!(file_type, FileType::PDF, "Expected PDF file type");
-}
-
-// #[test]
-// fn test_png_file() {
-//     let url =
-//         "https://www.freepik.com/free-photo/draught-beer-png-mug_13299896.htm#query=png&position=0&from_view=keyword&track=ais_hybrid&uuid=49669f7d-f2c1-42b1-b2db-f28289d33b25";
-
-//     let result = view_bytes(url);
-
-//     assert!(result.is_ok(), "Failed to read in bytes");
-
-//     let (_, file_type) = result.unwrap();
-//     assert_eq!(file_type, FileType::PNG, "Expected PNG file type");
-// }
-
-// #[test]
-// fn test_jpeg_file() {
-//     let url = "https://www.freepik.com/free-photo/transparent-colourful-autumn-leaves_5286206.htm#query=jpeg&position=20&from_view=keyword&track=ais_hybrid&uuid=169c2d48-7211-46d0-8097-8ee9cea5a8e4";
-
-//     let result = view_bytes(url);
-
-//     assert!(result.is_ok(), "Failed to read in bytes");
-
-//     let (_, file_type) = result.unwrap();
-//     assert_eq!(file_type, FileType::JPEG, "Expected JPEG file type");
-// }
-
-// #[test]
-// fn test_gif_file() {
-//     let url = "https://www.w3.org/People/mimasa/test/imgformat/img/w3c_home.gif";
-
-//     let result = view_bytes(url);
-
-//     assert!(result.is_ok(), "Failed to read in bytes");
-
-//     let (_, file_type) = result.unwrap();
-//     assert_eq!(file_type, FileType::GIF, "Expected GIF file type");
-// }
-
-#[test]
-fn test_zip_file() {
-    let url = "https://data.london.gov.uk/download/trend-based-population-projections/f042e4c5-9365-4d88-9a48-66a2158a2873/2021-based%20trend%20projections.zip";
-
-    let result = view_bytes(url);
-
-    assert!(result.is_ok(), "Failed to read in bytes");
-
-    let (_, file_type) = result.unwrap();
-    assert_eq!(file_type, FileType::ZIP, "Expected ZIP file type");
-}
-
-// #[test]
-// fn test_xlsx_file() {
-//     let url = "https://datamillnorth.org/download/2o13g/8n0/February%202025%20HMO%20public%20register.xlsx";
-
-//     let result = view_bytes(url);
-
-//     assert!(result.is_ok(), "Failed to read in bytes");
-
-//     let (_, file_type) = result.unwrap();
-//     assert_eq!(file_type, FileType::XLSX, "Expected XLSX file type");
-// }
-
-// #[test]
-// fn test_docx_file() {
-//     let url = "https://data.london.gov.uk/download/kingston-upon-thames-reduction-and-recycling-plan/91f911b4-c0eb-4725-8640-a37eaf72c9d8/RBK%20RRP%20-%20July%202024%20update.docx";
-
-//     let result = view_bytes(url);
-
-//     assert!(result.is_ok(), "Failed to read in bytes");
-
-//     let (_, file_type) = result.unwrap();
-//     assert_eq!(file_type, FileType::DOCX, "Expected DOCX file type");
-// }
-
-#[test]
-fn test_xls_file() {
-    let url = "https://data.london.gov.uk/download/diversity-london-report-data/4090b383-a418-4e2b-8592-75334244cbc7/diversity-in-london-data-2003.xls";
-
-    let result = view_bytes(url);
+    let result = FileBytes::from_url(url, Operation::Nibble);
 
     assert!(result.is_ok(), "Failed to read in bytes");
 
-    let (_, file_type) = result.unwrap();
-    assert_eq!(file_type, FileType::XLS, "Expected XLS file type");
+    let file_bytes = result.unwrap();
+    assert_eq!(file_bytes.identify_type(), FileType::PDF, "Expected PDF file type");
 }
-
-// #[test]
-// fn test_parquet_file() {
-//     let url = "https://github.com/apache/parquet-testing/blob/master/data/alltypes_plain.parquet?raw=true";
-
-//     let result = view_bytes(url);
-
-//     assert!(result.is_ok(), "Failed to read in bytes");
-
-//     let (_, file_type) = result.unwrap();
-//     assert_eq!(file_type, FileType::PARQUET, "Expected Parquet file type");
-// }
-
-// #[test]
-// fn test_csv_file() {
-//     let url = "https://data.london.gov.uk/download/mpsantisocialbehaviour/5ee9e479-f719-4788-a233-ec26a295805f/MPS_Antisocial_Behaviour.csv";
-
-//     let result = view_bytes(url);
-
-//     assert!(result.is_ok(), "Failed to read in bytes");
-
-//     let (_, file_type) = result.unwrap();
-//     assert_eq!(file_type, FileType::CSV, "Expected CSV file type");
-// }