Skip to content

Commit

Permalink
Develop JsonSectionPageMapper in Rust API (#1224)
Browse files Browse the repository at this point in the history
### Summary

- Ticket no. 127135 and 127136.
- Develop `JsonSectionPageMapper` to construct page maps for top-level
sections in a given JSON file.
- Enhance `DatumaroImporter.detect()`'s performance by replacing JSON
file parsing logic with the `JsonSectionPageMapper`.

### How to test
The existing tests validate its functionality. For a performance
comparison, see the following.

- Before
```python
from datumaro.rust_api import JsonSectionPageMapper
from time import time
import datumaro as dm

start = time()
format = dm.Dataset.detect("ws_test/coco/datumaro")
dt = 1000.0 * (time() - start)
print(f"Duration for detecting Datumaro data format: {dt:.1f}ms, format={format}")
```

```console
Duration for detecting Datumaro data format: 25784.5ms, format=datumaro
```

- After
```python
from datumaro.rust_api import JsonSectionPageMapper
from time import time
import datumaro as dm

start = time()
format = dm.Dataset.detect("ws_test/coco/datumaro")
dt = 1000.0 * (time() - start)
print(f"Duration for detecting Datumaro data format: {dt:.1f}ms, format={format}")
```

```console
Duration for detecting Datumaro data format: 17234.7ms, format=datumaro
```

It saves ~8.5 secs (25784.5ms -> 17234.7ms).

### Checklist
<!-- Put an 'x' in all the boxes that apply -->
- [ ] I have added unit tests to cover my changes.​
- [ ] I have added integration tests to cover my changes.​
- [x] I have added the description of my changes into
[CHANGELOG](https://github.com/openvinotoolkit/datumaro/blob/develop/CHANGELOG.md).​
- [ ] I have updated the
[documentation](https://github.com/openvinotoolkit/datumaro/tree/develop/docs)
accordingly

### License

- [x] I submit _my code changes_ under the same [MIT
License](https://github.com/openvinotoolkit/datumaro/blob/develop/LICENSE)
that covers the project.
  Feel free to contact the maintainers if that's a concern.
- [x] I have updated the license header for each file (see an example
below).

```python
# Copyright (C) 2023 Intel Corporation
#
# SPDX-License-Identifier: MIT
```

---------

Signed-off-by: Kim, Vinnam <vinnam.kim@intel.com>
  • Loading branch information
vinnamkim authored Dec 14, 2023
1 parent 833ee1d commit 6b567b7
Show file tree
Hide file tree
Showing 4 changed files with 243 additions and 4 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### New features
- Support MMDetection COCO format
(<https://github.com/openvinotoolkit/datumaro/pull/1213>)
- Develop JsonSectionPageMapper in Rust API
(<https://github.com/openvinotoolkit/datumaro/pull/1224>)

### Enhancements
- Optimize Python import to make CLI entrypoint faster
Expand Down
232 changes: 232 additions & 0 deletions rust/src/json_section_page_mapper.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
// Copyright (C) 2023 Intel Corporation
//
// SPDX-License-Identifier: MIT

use crate::{
page_mapper::{JsonPageMapper, ParsedJsonSection},
utils::read_skipping_ws,
};
use pyo3::{prelude::*, types::PyDict};
use std::{
collections::HashMap,
fs::File,
io::{self, BufReader, Read, Seek},
path::Path,
};

/// Location of one parsed section (a key's value) inside the JSON source.
///
/// Instances are built by `JsonSection::parse`, which fills the fields from the
/// reader's stream positions.
#[derive(Debug)]
struct JsonSection {
// Key string handed to `parse` for this section.
key: String,
// Stream position at which the section's value starts.
offset: usize,
// Length of the value, computed as end position minus `offset`.
size: usize,
}

/// Consumes the rest of a JSON array/object whose opening bracket was already read.
///
/// `stack` holds the currently open brackets and `last_token` mirrors its top
/// element. Bytes are read until the stack becomes empty, i.e. until the bracket
/// that closes the outermost open container has been consumed.
///
/// String literals are skipped as a whole so that brackets inside them do not
/// affect the nesting stack; a backslash escapes the following byte, so `\"`
/// does not terminate the string.
///
/// # Errors
///
/// * `io::ErrorKind::InvalidData` when a closing bracket does not match the
///   innermost open bracket.
/// * Any error produced by the reader (including hitting the end of the input
///   before every container, or a string literal, is closed).
fn handle_arr_or_dict(
    mut stack: Vec<u8>,
    mut reader: impl Read + Seek,
    mut last_token: u8,
) -> Result<(), io::Error> {
    while !stack.is_empty() {
        let c = read_skipping_ws(&mut reader)?;
        match c {
            b'{' | b'[' => {
                stack.push(c);
                last_token = c;
            }
            b'}' | b']' => {
                let expected_open = if c == b'}' { b'{' } else { b'[' };
                if last_token != expected_open {
                    let cur_pos = reader.stream_position()?;
                    let msg = format!(
                        "Last token in the stack is '{}', but the given token at offset={} is '{}'",
                        last_token as char, cur_pos, c as char
                    );
                    return Err(io::Error::new(io::ErrorKind::InvalidData, msg));
                }
                stack.pop();
                // Keep `last_token` in sync with the new innermost open bracket.
                if let Some(&top) = stack.last() {
                    last_token = top;
                }
            }
            b'"' => loop {
                match read_skipping_ws(&mut reader)? {
                    b'\\' => {
                        // The byte after a backslash is part of an escape sequence
                        // and must never be interpreted as the closing quote.
                        let mut escaped = [0u8; 1];
                        reader.read_exact(&mut escaped)?;
                    }
                    b'"' => break,
                    _ => {}
                }
            },
            _ => {}
        }
    }
    Ok(())
}

/// Consumes the rest of a JSON string literal whose opening quote was already read.
///
/// On success the reader is positioned right after the closing quote. A backslash
/// escapes the following byte, so `\"` does not terminate the string.
///
/// # Errors
///
/// Returns the reader's error (e.g. unexpected end of input) when the closing
/// quote is never found, instead of silently reporting success.
fn handle_string(mut reader: impl Read + Seek) -> Result<(), io::Error> {
    loop {
        match read_skipping_ws(&mut reader)? {
            b'\\' => {
                // Skip exactly one escaped byte so an escaped quote is not taken
                // for the end of the string.
                let mut escaped = [0u8; 1];
                reader.read_exact(&mut escaped)?;
            }
            b'"' => return Ok(()),
            _ => {}
        }
    }
}

/// Finds the stream offset at which the next JSON value starts.
///
/// Every non-whitespace byte that is read is pushed onto `stack`, so after a
/// successful return the last element of `stack` tells the caller how the scan
/// ended:
///
/// * `{`, `[` or `"` - a container/string value starts here; the returned offset
///   is the position of that opening byte.
/// * `,` - a scalar value (number, `true`, `null`, ...) was read up to its
///   trailing comma; the returned offset is the position of the scalar's first
///   byte.
///
/// # Errors
///
/// Returns `io::ErrorKind::InvalidData` when a comma is met before any value
/// byte (there is no value to locate), or when the input ends before a value
/// boundary is found. Errors from `stream_position` are propagated.
fn get_offset(mut reader: impl Read + Seek, stack: &mut Vec<u8>) -> Result<usize, io::Error> {
    // Stream position right after the first byte of a scalar value, once seen.
    let mut first_scalar_pos: Option<usize> = None;

    while let Ok(c) = read_skipping_ws(&mut reader) {
        stack.push(c);
        match c {
            b'{' | b'[' | b'"' => {
                // The cursor is one past the opening byte.
                return Ok(reader.stream_position()? as usize - 1);
            }
            b',' => {
                // A comma with no preceding value byte must not yield an offset.
                return first_scalar_pos.map(|pos| pos - 1).ok_or_else(|| {
                    io::Error::new(
                        io::ErrorKind::InvalidData,
                        "Cannot get offset: no value found before ','",
                    )
                });
            }
            _ => {
                // Positions only grow while reading, so the first one recorded
                // is the start of the scalar.
                if first_scalar_pos.is_none() {
                    first_scalar_pos = Some(reader.stream_position()? as usize);
                }
            }
        }
    }

    Err(io::Error::new(
        io::ErrorKind::InvalidData,
        "Cannot get offset",
    ))
}

impl ParsedJsonSection for JsonSection {
fn parse(buf_key: String, mut reader: impl Read + Seek) -> Result<Box<JsonSection>, io::Error> {
// Move reader's cursor right after ':'
while let Ok(c) = read_skipping_ws(&mut reader) {
if c == b':' {
break;
}
}

let mut stack = vec![];

let start_offset = get_offset(&mut reader, &mut stack)?;

let last_token = *stack
.last()
.ok_or(io::Error::new(io::ErrorKind::InvalidData, "stack is empty"))?;

let end_offset = match last_token {
b'[' | b'{' => {
let _ = handle_arr_or_dict(stack, &mut reader, last_token)?;
Ok(reader.stream_position()? as usize)
}
b'"' => {
let _ = handle_string(&mut reader)?;
Ok(reader.stream_position()? as usize)
}
b',' => Ok(reader.stream_position()? as usize - 1),
_ => Err(io::Error::new(io::ErrorKind::InvalidData, "s")),
}?;

let size = end_offset - start_offset;

Ok(Box::new(JsonSection {
key: buf_key,
offset: start_offset,
size: size,
}))
}
}

/// Pure-Rust page mapper holding the sections parsed from a JSON source.
#[derive(Debug)]
pub struct JsonSectionPageMapperImpl {
// Sections in the order returned by `parse_json`.
sections: Vec<Box<JsonSection>>,
}

// Empty impl: nothing is overridden here. `parse_json`, used by `new`, is
// presumably a provided method of `JsonPageMapper` - confirm in `page_mapper`.
impl JsonPageMapper<JsonSection> for JsonSectionPageMapperImpl {}

impl JsonSectionPageMapperImpl {
pub fn new(mut reader: impl Read + Seek) -> Result<Self, io::Error> {
let sections = Self::parse_json(&mut reader)?;

Ok(JsonSectionPageMapperImpl { sections: sections })
}
}

// Python-facing wrapper around `JsonSectionPageMapperImpl`.
// (Plain `//` comments are used so the Python-visible class docstring is unchanged.)
#[pyclass]
pub struct JsonSectionPageMapper {
// Buffered reader over the opened file; stored by `py_new` but not read by
// any method visible in this file.
reader: BufReader<File>,
// Parsed section map that `sections()` and `__len__()` expose.
mapper: JsonSectionPageMapperImpl,
}

#[pymethods]
impl JsonSectionPageMapper {
#[new]
fn py_new(path: String) -> PyResult<Self> {
let file = File::open(Path::new(&path))?;
let mut reader = BufReader::new(file);
let mapper = JsonSectionPageMapperImpl::new(&mut reader)?;

Ok(JsonSectionPageMapper { reader, mapper })
}

fn sections(self_: PyRef<Self>) -> PyResult<PyObject> {
let dict: HashMap<&str, HashMap<&str, usize>> = self_
.mapper
.sections
.iter()
.map(|section| {
let nested_dict: HashMap<&str, usize> =
HashMap::from_iter([("offset", section.offset), ("size", section.size)]);
(section.key.as_str(), nested_dict)
})
.collect();

Ok(dict.into_py(self_.py()))
}

fn __len__(&self) -> PyResult<usize> {
Ok(self.mapper.sections.len())
}
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::test_helpers::prepare_reader;

    /// Parses a Datumaro-style annotation document and checks that every reported
    /// section (offset, size) slices out a standalone, valid JSON value.
    #[test]
    fn test_instance() {
        const EXAMPLE: &str = r#"{"dm_format_version": "1.0", "media_type": 2, "infos": {"string": "test", "int": 0, "float": 0.0, "string_list": ["test0", "test1", "test2"], "int_list": [0, 1, 2], "float_list": [0.0, 0.1, 0.2]}, "categories": {"label": {"labels": [{"name": "cat0", "parent": "", "attributes": ["x", "y"]}, {"name": "cat1", "parent": "", "attributes": ["x", "y"]}, {"name": "cat2", "parent": "", "attributes": ["x", "y"]}, {"name": "cat3", "parent": "", "attributes": ["x", "y"]}, {"name": "cat4", "parent": "", "attributes": ["x", "y"]}], "label_groups": [], "attributes": ["a", "b", "score"]}, "mask": {"colormap": [{"label_id": 0, "r": 0, "g": 0, "b": 0}, {"label_id": 1, "r": 128, "g": 0, "b": 0}, {"label_id": 2, "r": 0, "g": 128, "b": 0}, {"label_id": 3, "r": 128, "g": 128, "b": 0}, {"label_id": 4, "r": 0, "g": 0, "b": 128}]}, "points": {"items": [{"label_id": 0, "labels": ["cat1", "cat2"], "joints": [[0, 1]]}, {"label_id": 1, "labels": ["cat1", "cat2"], "joints": [[0, 1]]}, {"label_id": 2, "labels": ["cat1", "cat2"], "joints": [[0, 1]]}, {"label_id": 3, "labels": ["cat1", "cat2"], "joints": [[0, 1]]}, {"label_id": 4, "labels": ["cat1", "cat2"], "joints": [[0, 1]]}]}}, "items": [{"id": "42", "annotations": [{"id": 900100087038, "type": "mask", "attributes": {}, "group": 900100087038, "label_id": null, "rle": {"counts": "06", "size": [2, 3]}, "z_order": 0}, {"id": 900100087038, "type": "mask", "attributes": {}, "group": 900100087038, "label_id": null, "rle": {"counts": "06", "size": [2, 3]}, "z_order": 0}], "image": {"path": "42.jpg", "size": [10, 6]}}, {"id": "43", "annotations": [], "image": {"path": "43.qq", "size": [2, 4]}}]}
"#;

        // The temp file handle is kept bound so it outlives the reader.
        let (_tempfile, mut reader) = prepare_reader(EXAMPLE);
        let json_section_page_mapper = JsonSectionPageMapperImpl::new(&mut reader).unwrap();

        println!("{:?}", json_section_page_mapper);

        let mut keys = Vec::new();

        for section in &json_section_page_mapper.sections {
            reader
                .seek(io::SeekFrom::Start(section.offset as u64))
                .expect("Cannot seek to the section offset");

            // `read_exact` guarantees the whole section is read; a plain `read`
            // may legally return fewer bytes than requested.
            let mut buf = vec![0; section.size];
            reader
                .read_exact(buf.as_mut_slice())
                .expect("Cannot read the whole section");

            let content: serde_json::Value = serde_json::from_str(
                std::str::from_utf8(buf.as_slice()).expect("Cannot change to utf8"),
            )
            .unwrap();
            println!("Section: {}, Content: {:?}", section.key, content);

            keys.push(section.key.as_str());
        }

        // Format detection relies on these two top-level sections being reported.
        assert!(keys.contains(&"categories"));
        assert!(keys.contains(&"items"));
    }
}
3 changes: 3 additions & 0 deletions rust/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

mod coco_page_mapper;
mod datum_page_mapper;
mod json_section_page_mapper;
mod page_mapper;
mod page_maps;
mod test_helpers;
Expand All @@ -12,13 +13,15 @@ use pyo3::prelude::*;

use crate::coco_page_mapper::CocoPageMapper;
use crate::datum_page_mapper::DatumPageMapper;
use crate::json_section_page_mapper::JsonSectionPageMapper;

/// Datumaro Rust API
#[pymodule]
#[pyo3(name = "rust_api")]
fn rust_api(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
// Register every page mapper class on the module; a failing registration
// aborts module initialisation through `?`.
m.add_class::<CocoPageMapper>()?;
m.add_class::<DatumPageMapper>()?;
m.add_class::<JsonSectionPageMapper>()?;

Ok(())
}
10 changes: 6 additions & 4 deletions src/datumaro/plugins/data_formats/datumaro/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from datumaro.components.format_detection import FormatDetectionConfidence, FormatDetectionContext
from datumaro.components.importer import Importer
from datumaro.components.merge.extractor_merger import ExtractorMerger
from datumaro.util import parse_json
from datumaro.rust_api import JsonSectionPageMapper

from .format import DatumaroPath

Expand All @@ -28,9 +28,11 @@ def detect(
with context.probe_text_file(
annot_file,
'must be a JSON object with "categories" ' 'and "items" keys',
) as f:
contents = parse_json(f.read())
if not {"categories", "items"} <= contents.keys():
):
fpath = osp.join(context.root_path, annot_file)
page_mapper = JsonSectionPageMapper(fpath)
sections = page_mapper.sections()
if not {"categories", "items"} <= sections.keys():
raise Exception

@classmethod
Expand Down

0 comments on commit 6b567b7

Please # to comment.