diff --git a/CHANGELOG.md b/CHANGELOG.md
index e721f42301..465579b768 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### New features
 - Support MMDetection COCO format
   ()
+- Develop JsonSectionPageMapper in Rust API
+  ()
 
 ### Enhancements
 - Optimize Python import to make CLI entrypoint faster
diff --git a/rust/src/json_section_page_mapper.rs b/rust/src/json_section_page_mapper.rs
new file mode 100644
index 0000000000..020cf52c26
--- /dev/null
+++ b/rust/src/json_section_page_mapper.rs
@@ -0,0 +1,232 @@
+// Copyright (C) 2023 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+
+use crate::{
+    page_mapper::{JsonPageMapper, ParsedJsonSection},
+    utils::read_skipping_ws,
+};
+use pyo3::{prelude::*, types::PyDict};
+use std::{
+    collections::HashMap,
+    fs::File,
+    io::{self, BufReader, Read, Seek},
+    path::Path,
+};
+
+/// Byte span (offset and size) of one top-level section in a JSON file.
+#[derive(Debug)]
+struct JsonSection {
+    key: String,
+    offset: usize,
+    size: usize,
+}
+
+/// Consume the reader until the array or dict opened at the start of the section is balanced.
+fn handle_arr_or_dict(
+    mut stack: Vec<u8>,
+    mut reader: impl Read + Seek,
+    mut last_token: u8,
+) -> Result<(), io::Error> {
+    while stack.len() != 0 {
+        match read_skipping_ws(&mut reader) {
+            Ok(c) => match c {
+                b'{' | b'[' => {
+                    stack.push(c);
+                    last_token = c;
+                }
+                b'}' => {
+                    if last_token != b'{' {
+                        let cur_pos = reader.stream_position()?;
+                        let msg = format!("Last token in the stack is '{}', but the given token at offset={} is '}}'", last_token as char, cur_pos);
+                        return Err(io::Error::new(io::ErrorKind::InvalidData, msg));
+                    }
+                    stack.pop();
+                    if stack.len() != 0 {
+                        last_token = *stack
+                            .last()
+                            .ok_or(io::Error::new(io::ErrorKind::InvalidData, "stack is empty"))?;
+                    }
+                }
+                b']' => {
+                    if last_token != b'[' {
+                        let cur_pos = reader.stream_position()?;
+                        let msg = format!("Last token in the stack is '{}', but the given token at offset={} is ']'", last_token as char, cur_pos);
+                        return Err(io::Error::new(io::ErrorKind::InvalidData, msg));
+                    }
+                    stack.pop();
+                    if stack.len() != 0 {
+                        last_token = *stack
+                            .last()
+                            .ok_or(io::Error::new(io::ErrorKind::InvalidData, "stack is empty"))?;
+                    }
+                }
+                b'"' => {
+                    while let Ok(c) = read_skipping_ws(&mut reader) {
+                        if c == b'"' {
+                            break;
+                        }
+                    }
+                }
+                _ => {}
+            },
+            Err(err) => {
+                return Err(err);
+            }
+        }
+    }
+    Ok(())
+}
+
+/// Consume the reader until the closing '"' of a string value.
+fn handle_string(mut reader: impl Read + Seek) -> Result<(), io::Error> {
+    while let Ok(c) = read_skipping_ws(&mut reader) {
+        if c == b'"' {
+            break;
+        }
+    }
+    Ok(())
+}
+
+/// Return the byte offset at which the value of the current section begins.
+fn get_offset(mut reader: impl Read + Seek, stack: &mut Vec<u8>) -> Result<usize, io::Error> {
+    let mut offset = usize::MAX;
+    while let Ok(c) = read_skipping_ws(&mut reader) {
+        stack.push(c);
+        match c {
+            b'{' | b'[' | b'"' => {
+                return Ok(reader.stream_position()? as usize - 1);
+            }
+            b',' => {
+                return Ok(offset - 1);
+            }
+            _ => {
+                let pos = reader.stream_position()? as usize;
+                offset = std::cmp::min(pos, offset);
+            }
+        }
+    }
+    Err(io::Error::new(
+        io::ErrorKind::InvalidData,
+        "Cannot get offset",
+    ))
+}
+
+impl ParsedJsonSection for JsonSection {
+    fn parse(buf_key: String, mut reader: impl Read + Seek) -> Result<Box<Self>, io::Error> {
+        // Move reader's cursor right after ':'
+        while let Ok(c) = read_skipping_ws(&mut reader) {
+            if c == b':' {
+                break;
+            }
+        }
+
+        let mut stack = vec![];
+
+        let start_offset = get_offset(&mut reader, &mut stack)?;
+
+        let last_token = *stack
+            .last()
+            .ok_or(io::Error::new(io::ErrorKind::InvalidData, "stack is empty"))?;
+
+        // Scan to the end of the value so its size can be recorded without parsing it.
+        let end_offset = match last_token {
+            b'[' | b'{' => {
+                let _ = handle_arr_or_dict(stack, &mut reader, last_token)?;
+                Ok(reader.stream_position()? as usize)
+            }
+            b'"' => {
+                let _ = handle_string(&mut reader)?;
+                Ok(reader.stream_position()? as usize)
+            }
+            b',' => Ok(reader.stream_position()? as usize - 1),
+            _ => Err(io::Error::new(io::ErrorKind::InvalidData, "s")),
+        }?;
+
+        let size = end_offset - start_offset;
+
+        Ok(Box::new(JsonSection {
+            key: buf_key,
+            offset: start_offset,
+            size: size,
+        }))
+    }
+}
+
+#[derive(Debug)]
+pub struct JsonSectionPageMapperImpl {
+    sections: Vec<Box<JsonSection>>,
+}
+
+impl JsonPageMapper<JsonSection> for JsonSectionPageMapperImpl {}
+
+impl JsonSectionPageMapperImpl {
+    pub fn new(mut reader: impl Read + Seek) -> Result<Self, io::Error> {
+        let sections = Self::parse_json(&mut reader)?;
+
+        Ok(JsonSectionPageMapperImpl { sections: sections })
+    }
+}
+
+#[pyclass]
+pub struct JsonSectionPageMapper {
+    reader: BufReader<File>,
+    mapper: JsonSectionPageMapperImpl,
+}
+
+#[pymethods]
+impl JsonSectionPageMapper {
+    #[new]
+    fn py_new(path: String) -> PyResult<Self> {
+        let file = File::open(Path::new(&path))?;
+        let mut reader = BufReader::new(file);
+        let mapper = JsonSectionPageMapperImpl::new(&mut reader)?;
+
+        Ok(JsonSectionPageMapper { reader, mapper })
+    }
+
+    fn sections(self_: PyRef<Self>) -> PyResult<PyObject> {
+        let dict: HashMap<&str, HashMap<&str, usize>> = self_
+            .mapper
+            .sections
+            .iter()
+            .map(|section| {
+                let nested_dict: HashMap<&str, usize> =
+                    HashMap::from_iter([("offset", section.offset), ("size", section.size)]);
+                (section.key.as_str(), nested_dict)
+            })
+            .collect();
+
+        Ok(dict.into_py(self_.py()))
+    }
+
+    fn __len__(&self) -> PyResult<usize> {
+        Ok(self.mapper.sections.len())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::test_helpers::prepare_reader;
+
+    #[test]
+    fn test_instance() {
+        const EXAMPLE: &str = r#"{"dm_format_version": "1.0", "media_type": 2, "infos": {"string": "test", "int": 0, "float": 0.0, "string_list": ["test0", "test1", "test2"], "int_list": [0, 1, 2], "float_list": [0.0, 0.1, 0.2]}, "categories": {"label": {"labels": [{"name": "cat0", "parent": "", "attributes": ["x", "y"]}, {"name": "cat1", "parent": "", "attributes": ["x", "y"]}, {"name": "cat2", "parent": "", "attributes": ["x", "y"]}, {"name": "cat3", "parent": "", "attributes": ["x", "y"]}, {"name": "cat4", "parent": "", "attributes": ["x", "y"]}], "label_groups": [], "attributes": ["a", "b", "score"]}, "mask": {"colormap": [{"label_id": 0, "r": 0, "g": 0, "b": 0}, {"label_id": 1, "r": 128, "g": 0, "b": 0}, {"label_id": 2, "r": 0, "g": 128, "b": 0}, {"label_id": 3, "r": 128, "g": 128, "b": 0}, {"label_id": 4, "r": 0, "g": 0, "b": 128}]}, "points": {"items": [{"label_id": 0, "labels": ["cat1", "cat2"], "joints": [[0, 1]]}, {"label_id": 1, "labels": ["cat1", "cat2"], "joints": [[0, 1]]}, {"label_id": 2, "labels": ["cat1", "cat2"], "joints": [[0, 1]]}, {"label_id": 3, "labels": ["cat1", "cat2"], "joints": [[0, 1]]}, {"label_id": 4, "labels": ["cat1", "cat2"], "joints": [[0, 1]]}]}}, "items": [{"id": "42", "annotations": [{"id": 900100087038, "type": "mask", "attributes": {}, "group": 900100087038, "label_id": null, "rle": {"counts": "06", "size": [2, 3]}, "z_order": 0}, {"id": 900100087038, "type": "mask", "attributes": {}, "group": 900100087038, "label_id": null, "rle": {"counts": "06", "size": [2, 3]}, "z_order": 0}], "image": {"path": "42.jpg", "size": [10, 6]}}, {"id": "43", "annotations": [], "image": {"path": "43.qq", "size": [2, 4]}}]}
+        "#;
+
+        let (tempfile, mut reader) = prepare_reader(EXAMPLE);
+        let json_section_page_mapper = JsonSectionPageMapperImpl::new(&mut reader).unwrap();
+
+        println!("{:?}", json_section_page_mapper);
+
+        for section in json_section_page_mapper.sections {
+            let offset = section.offset;
+            let size = section.size;
+            reader.seek(io::SeekFrom::Start(offset as u64));
+            let mut buf = vec![0; size];
+            reader.read(buf.as_mut_slice());
+
+            let content: serde_json::Value = serde_json::from_str(
+                std::str::from_utf8(buf.as_slice()).expect("Cannot change to utf8"),
+            )
+            .unwrap();
+            println!("Section: {}, Content: {:?}", section.key, content);
+        }
+    }
+}
diff --git a/rust/src/lib.rs b/rust/src/lib.rs
index b7fb71ae0b..699066e306 100644
--- a/rust/src/lib.rs
+++ b/rust/src/lib.rs
@@ -4,6 +4,7 @@
 
 mod coco_page_mapper;
 mod datum_page_mapper;
+mod json_section_page_mapper;
 mod page_mapper;
 mod page_maps;
 mod test_helpers;
@@ -12,6 +13,7 @@ use pyo3::prelude::*;
 
 use crate::coco_page_mapper::CocoPageMapper;
 use crate::datum_page_mapper::DatumPageMapper;
+use crate::json_section_page_mapper::JsonSectionPageMapper;
 
 /// Datumaro Rust API
 #[pymodule]
@@ -19,6 +21,7 @@ use crate::datum_page_mapper::DatumPageMapper;
 fn rust_api(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
     m.add_class::<CocoPageMapper>()?;
     m.add_class::<DatumPageMapper>()?;
+    m.add_class::<JsonSectionPageMapper>()?;
 
     Ok(())
 }
diff --git a/src/datumaro/plugins/data_formats/datumaro/importer.py b/src/datumaro/plugins/data_formats/datumaro/importer.py
index 4c925261bb..fbcc8da72a 100644
--- a/src/datumaro/plugins/data_formats/datumaro/importer.py
+++ b/src/datumaro/plugins/data_formats/datumaro/importer.py
@@ -8,7 +8,7 @@
 from datumaro.components.format_detection import FormatDetectionConfidence, FormatDetectionContext
 from datumaro.components.importer import Importer
 from datumaro.components.merge.extractor_merger import ExtractorMerger
-from datumaro.util import parse_json
+from datumaro.rust_api import JsonSectionPageMapper
 
 from .format import DatumaroPath
 
@@ -28,9 +28,11 @@ def detect(
         with context.probe_text_file(
             annot_file,
             'must be a JSON object with "categories" ' 'and "items" keys',
-        ) as f:
-            contents = parse_json(f.read())
-            if not {"categories", "items"} <= contents.keys():
+        ):
+            fpath = osp.join(context.root_path, annot_file)
+            page_mapper = JsonSectionPageMapper(fpath)
+            sections = page_mapper.sections()
+            if not {"categories", "items"} <= sections.keys():
                 raise Exception
 
     @classmethod