Skip to content

Commit

Permalink
Error on UTF8 surrogates
Browse files Browse the repository at this point in the history
  • Loading branch information
ijl committed Jan 3, 2019
1 parent a87e356 commit 432b159
Show file tree
Hide file tree
Showing 6 changed files with 100 additions and 13 deletions.
35 changes: 34 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ support subclasses.
It raises `TypeError` on an unsupported type. This exception message
describes the invalid object.

It raises `TypeError` on a `str` that contains invalid UTF-8.

It raises `TypeError` on an integer that exceeds 64 bits. This is the same
as the standard library's `json` module.

Expand Down Expand Up @@ -100,6 +102,36 @@ b'{"bool":true,"\xf0\x9f\x90\x88":"\xe5\x93\x88\xe5\x93\x88","int":9223372036854
'{"bool": true, "\\ud83d\\udc08": "\\u54c8\\u54c8", "int": 9223372036854775807, "float": 1.337e+40}'
```

### UTF-8

orjson raises an exception on invalid UTF-8. This check is necessary
because Python 3 str objects may contain unpaired UTF-16 surrogates, which
cannot be encoded as valid UTF-8. The standard library's json module
accepts such invalid input.

```python
>>> import orjson, ujson, rapidjson, json
>>> orjson.dumps('\ud800')
TypeError: str is not valid UTF-8: surrogates not allowed
>>> ujson.dumps('\ud800')
UnicodeEncodeError: 'utf-8' codec ...
>>> rapidjson.dumps('\ud800')
UnicodeEncodeError: 'utf-8' codec ...
>>> json.dumps('\ud800')
'"\\ud800"'
```

```python
>>> import orjson, ujson, rapidjson, json
>>> orjson.loads('"\\ud800"')
JSONDecodeError: unexpected end of hex escape at line 1 column 8: line 1 column 1 (char 0)
>>> ujson.loads('"\\ud800"')
''
>>> rapidjson.loads('"\\ud800"')
ValueError: Parse error at offset 1: The surrogate pair in string is invalid.
>>> json.loads('"\\ud800"')
'\ud800'
```

## Testing

The library has comprehensive tests. There are unit tests against the
Expand All @@ -108,7 +140,8 @@ roundtrip, jsonchecker, and fixtures files of the
repository. It is tested to not crash against the
[Big List of Naughty Strings](https://github.com/minimaxir/big-list-of-naughty-strings).
It is tested to not leak memory. It is tested to be correct against
input from the PyJFuzz JSON fuzzer. There are integration tests
input from the PyJFuzz JSON fuzzer. It is tested to not crash
on, and to reject, invalid UTF-8. There are integration tests
exercising the library's use in web servers (uwsgi and gunicorn,
using multiprocess/forked workers) and when
multithreaded. It also uses some tests from the ultrajson library.
Expand Down
21 changes: 14 additions & 7 deletions src/decode.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: (Apache-2.0 OR MIT)

use crate::typeref;
use crate::exc::*;
use pyo3::prelude::*;
use serde::de::{self, DeserializeSeed, Deserializer, MapAccess, SeqAccess, Visitor};
use smallvec::SmallVec;
Expand All @@ -9,23 +10,29 @@ use std::fmt;
use std::marker::PhantomData;
use std::os::raw::c_char;

import_exception!(json, JSONDecodeError);

pub fn deserialize(py: Python, ptr: *mut pyo3::ffi::PyObject) -> PyResult<PyObject> {
let obj_type_ptr = unsafe { (*ptr).ob_type };
let data: Cow<str>;
if unsafe { obj_type_ptr == typeref::STR_PTR } {
let mut str_size: pyo3::ffi::Py_ssize_t = unsafe { std::mem::uninitialized() };
let uni = unsafe { pyo3::ffi::PyUnicode_AsUTF8AndSize(ptr, &mut str_size) as *const u8 };
if unsafe { std::intrinsics::unlikely(uni.is_null()) } {
return Err(JSONDecodeError::py_err((INVALID_STR, "", 0)));
}
data = unsafe {
Cow::Borrowed(std::str::from_utf8_unchecked(std::slice::from_raw_parts(
pyo3::ffi::PyUnicode_AsUTF8AndSize(ptr, &mut str_size) as *const u8,
str_size as usize,
)))
Cow::Borrowed(std::str::from_utf8_unchecked(std::slice::from_raw_parts(uni, str_size as usize)))
};
} else if unsafe { obj_type_ptr == typeref::BYTES_PTR } {
let buffer = unsafe { pyo3::ffi::PyBytes_AsString(ptr) as *const u8 };
let length = unsafe { pyo3::ffi::PyBytes_Size(ptr) as usize };
data = unsafe { String::from_utf8_lossy(std::slice::from_raw_parts(buffer, length)) };
match String::from_utf8(unsafe { std::slice::from_raw_parts(buffer, length).to_vec() }) {
Ok(string) => {
data = Cow::Owned(string);
},
Err(_) => {
return Err(JSONDecodeError::py_err((INVALID_STR, "", 0)));
}
}
} else {
return Err(pyo3::exceptions::TypeError::py_err(
"Input must be str or bytes",
Expand Down
23 changes: 19 additions & 4 deletions src/encode.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: (Apache-2.0 OR MIT)

use crate::typeref::*;
use crate::exc::*;
use pyo3::prelude::*;
use serde::ser::{self, Serialize, SerializeMap, SerializeSeq, Serializer};
use std::ffi::CStr;
Expand Down Expand Up @@ -34,10 +35,12 @@ impl<'p> Serialize for SerializePyObject {
let obj_ptr = unsafe { (*self.ptr).ob_type };
if unsafe { obj_ptr == STR_PTR } {
let mut str_size: pyo3::ffi::Py_ssize_t = unsafe { std::mem::uninitialized() };
let data =
unsafe { pyo3::ffi::PyUnicode_AsUTF8AndSize(self.ptr, &mut str_size) as *const u8 };
let uni = unsafe { pyo3::ffi::PyUnicode_AsUTF8AndSize(self.ptr, &mut str_size) as *const u8 };
if unsafe { std::intrinsics::unlikely(uni.is_null()) } {
return Err(ser::Error::custom(INVALID_STR));
}
serializer.serialize_str(unsafe {
std::str::from_utf8_unchecked(std::slice::from_raw_parts(data, str_size as usize))
std::str::from_utf8_unchecked(std::slice::from_raw_parts(uni, str_size as usize))
})
} else if unsafe { obj_ptr == FLOAT_PTR } {
serializer.serialize_f64(unsafe { pyo3::ffi::PyFloat_AsDouble(self.ptr) })
Expand Down Expand Up @@ -70,6 +73,9 @@ impl<'p> Serialize for SerializePyObject {
let data = unsafe {
pyo3::ffi::PyUnicode_AsUTF8AndSize(key, &mut str_size) as *const u8
};
if unsafe { std::intrinsics::unlikely(data.is_null()) } {
return Err(ser::Error::custom(INVALID_STR));
}
map.serialize_entry(
unsafe {
std::str::from_utf8_unchecked(std::slice::from_raw_parts(
Expand Down Expand Up @@ -118,8 +124,17 @@ impl<'p> Serialize for SerializePyObject {
} else if unsafe { obj_ptr == BYTES_PTR } {
let buffer = unsafe { pyo3::ffi::PyBytes_AsString(self.ptr) as *const u8 };
let length = unsafe { pyo3::ffi::PyBytes_Size(self.ptr) as usize };
let pystr = unsafe { pyo3::ffi::PyUnicode_FromStringAndSize(
buffer as *const c_char,
length as pyo3::ffi::Py_ssize_t,
) };
if unsafe { std::intrinsics::unlikely(pystr.is_null()) } {
return Err(ser::Error::custom(INVALID_STR));
}
let mut str_size: pyo3::ffi::Py_ssize_t = unsafe { std::mem::uninitialized() };
let uni = unsafe { pyo3::ffi::PyUnicode_AsUTF8AndSize(pystr, &mut str_size) as *const u8 };
serializer.serialize_str(unsafe {
std::str::from_utf8_unchecked(std::slice::from_raw_parts(buffer, length))
std::str::from_utf8_unchecked(std::slice::from_raw_parts(uni, str_size as usize))
})
} else {
Err(ser::Error::custom(format_args!(
Expand Down
5 changes: 5 additions & 0 deletions src/exc.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
// SPDX-License-Identifier: (Apache-2.0 OR MIT)

// Shared error message raised by both the encoder (encode.rs) and the
// decoder (decode.rs) when a Python str/bytes object cannot be read as
// valid UTF-8 — e.g. when it contains UTF-16 surrogate code points.
pub const INVALID_STR: &str = "str is not valid UTF-8: surrogates not allowed";

// Bind Python's json.JSONDecodeError so Rust code can raise it and
// lib.rs can re-export it on the module.
// NOTE(review): assumes pyo3's `import_exception!` macro is already in
// scope here (e.g. via `#[macro_use]` on the pyo3 crate) — confirm.
import_exception!(json, JSONDecodeError);
3 changes: 2 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ use pyo3::ToPyPointer;

mod decode;
mod encode;
mod exc;
mod typeref;

#[pymodule]
Expand All @@ -23,7 +24,7 @@ fn orjson(py: Python, m: &PyModule) -> PyResult<()> {
m.add("__version__", env!("CARGO_PKG_VERSION"))?;
m.add_wrapped(wrap_function!(dumps))?;
m.add_wrapped(wrap_function!(loads))?;
m.add("JSONDecodeError", py.get_type::<decode::JSONDecodeError>())?;
m.add("JSONDecodeError", py.get_type::<exc::JSONDecodeError>())?;
Ok(())
}

Expand Down
26 changes: 26 additions & 0 deletions test/test_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,32 @@ def test_str(self):
self.assertEqual(orjson.dumps(obj), ref)
self.assertEqual(orjson.loads(ref), obj)

def test_str_replacement(self):
    """
    str roundtrip �
    """
    # U+FFFD (replacement character) must survive a dumps/loads roundtrip.
    encoded = orjson.dumps('�')
    self.assertEqual(encoded, b'"\xef\xbf\xbd"')
    decoded = orjson.loads(b'"\xef\xbf\xbd"')
    self.assertEqual(decoded, '�')

def test_str_surrogates_loads(self):
    """
    str unicode surrogates loads()
    """
    # Each payload encodes a surrogate code point; all must be rejected.
    payloads = (
        '"\ud800"',
        '"\ud83d\ude80"',
        '"\udcff"',
        b'"\xed\xa0\xbd\xed\xba\x80"',  # \ud83d\ude80
    )
    for payload in payloads:
        with self.assertRaises(orjson.JSONDecodeError):
            orjson.loads(payload)

def test_str_surrogates_dumps(self):
    """
    str unicode surrogates dumps()
    """
    # Surrogates are invalid UTF-8, whether in a str, a dict key,
    # or raw bytes; dumps() must raise TypeError for each.
    cases = (
        '\ud800',
        '\ud83d\ude80',
        '\udcff',
        {'\ud83d\ude80': None},
        b'\xed\xa0\xbd\xed\xba\x80',  # \ud83d\ude80
    )
    for obj in cases:
        with self.assertRaises(TypeError):
            orjson.dumps(obj)

def test_bytes(self):
"""
bytes
Expand Down

0 comments on commit 432b159

Please # to comment.