diff --git a/src/highdicom/io.py b/src/highdicom/io.py index 6edb18eb..70c182b2 100644 --- a/src/highdicom/io.py +++ b/src/highdicom/io.py @@ -19,7 +19,7 @@ read_partial ) from pydicom.tag import TupleTag, ItemTag, SequenceDelimiterTag -from pydicom.uid import UID +from pydicom.uid import UID, DeflatedExplicitVRLittleEndian from highdicom.frame import decode_frame from highdicom.color import ColorManager @@ -159,8 +159,13 @@ def _build_bot(fp: DicomFileLike, number_of_frames: int) -> List[int]: """ initial_position = fp.tell() - offset_values = [] - current_offset = 0 + + # We will keep two lists, one of all fragment boundaries (regardless of + # whether or not they are frame boundaries) and the other of just those + # fragment boundaries that are known to be frame boundaries (as identified + # by JPEG start markers). + frame_offset_values = [] + fragment_offset_values = [] i = 0 while True: frame_position = fp.tell() @@ -187,26 +192,33 @@ def _build_bot(fp: DicomFileLike, number_of_frames: int) -> List[int]: f'Length of Frame item #{i} is zero.' ) - first_two_bytes = fp.read(2) - if not fp.is_little_endian: - first_two_bytes = first_two_bytes[::-1] + current_offset = frame_position - initial_position + fragment_offset_values.append(current_offset) + # In case of fragmentation, we only want to get the offsets to the # first fragment of a given frame. We can identify those based on the # JPEG and JPEG 2000 markers that should be found at the beginning and # end of the compressed byte stream. 
+ first_two_bytes = fp.read(2) + if not fp.is_little_endian: + first_two_bytes = first_two_bytes[::-1] + if first_two_bytes in _START_MARKERS: - current_offset = frame_position - initial_position - offset_values.append(current_offset) + frame_offset_values.append(current_offset) i += 1 fp.seek(length - 2, 1) # minus the first two bytes - if len(offset_values) != number_of_frames: + if len(frame_offset_values) == number_of_frames: + basic_offset_table = frame_offset_values + elif len(fragment_offset_values) == number_of_frames: + # This covers RLE and others that have no frame markers but have a + # single fragment per frame + basic_offset_table = fragment_offset_values + else: raise ValueError( 'Number of frame items does not match specified Number of Frames.' ) - else: - basic_offset_table = offset_values fp.seek(initial_position, 0) return basic_offset_table @@ -426,6 +438,16 @@ def _read_metadata(self) -> None: self._metadata = Dataset(metadata) self._pixel_data_offset = self._fp.tell() + + if self.transfer_syntax_uid == DeflatedExplicitVRLittleEndian: + # The entire file is compressed with DEFLATE. These cannot be used + # since the entire file must be decompressed to read or build the + # basic/extended offset table + raise ValueError( + 'Deflated transfer syntaxes cannot be used with the ' + 'ImageFileReader.' 
+ ) + # Determine whether dataset contains a Pixel Data element try: tag = TupleTag(self._fp.read_tag()) diff --git a/tests/test_io.py b/tests/test_io.py index 07d2504c..8e1c698e 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -6,8 +6,10 @@ from pydicom import dcmread from pydicom.data import get_testdata_file from pydicom.filebase import DicomBytesIO, DicomFileLike +import pytest from highdicom.io import ImageFileReader +from tests.utils import find_readable_images class TestImageFileReader(unittest.TestCase): @@ -212,3 +214,75 @@ def test_read_single_frame_ct_image_dicom_file_like_opened(self): reader.metadata.Columns, ) np.testing.assert_array_equal(frame, pixel_array) + + def test_read_rle_no_bot(self): + # This image is RLE compressed but has no BOT, requiring searching + # through the pixel data for delimiter tags + filename = Path(get_testdata_file('rtdose_rle.dcm')) + + dataset = dcmread(filename) + pixel_array = dataset.pixel_array + with ImageFileReader(filename) as reader: + assert reader.number_of_frames == 15 + for f in range(reader.number_of_frames): + frame = reader.read_frame(f, correct_color=False) + assert isinstance(frame, np.ndarray) + assert frame.ndim == 2 + assert frame.dtype == np.uint32 + assert frame.shape == ( + reader.metadata.Rows, + reader.metadata.Columns, + ) + np.testing.assert_array_equal(frame, pixel_array[f]) + + def test_disallow_deflated_dataset(self): + # Files with a deflated transfer syntax must be rejected by the reader + msg = ( + 'Deflated transfer syntaxes cannot be used with the ' + 'ImageFileReader.' 
+ ) + filename = get_testdata_file('image_dfl.dcm') + + with pytest.raises(ValueError, match=msg): + with ImageFileReader(filename) as reader: + reader.read_frame(1) + + +@pytest.mark.parametrize( + 'filename', + find_readable_images(), +) +def test_all_images(filename): + dataset = dcmread(filename) + pixel_array = dataset.pixel_array + + is_color = dataset.SamplesPerPixel == 3 + number_of_frames = dataset.get('NumberOfFrames', 1) + is_multiframe = number_of_frames > 1 + + if is_color: + ndim = 3 + shape = ( + dataset.Rows, + dataset.Columns, + 3 + ) + else: + ndim = 2 + shape = ( + dataset.Rows, + dataset.Columns, + ) + + with ImageFileReader(filename) as reader: + assert reader.number_of_frames == number_of_frames + for f in range(reader.number_of_frames): + frame = reader.read_frame(f, correct_color=False) + assert isinstance(frame, np.ndarray) + assert frame.ndim == ndim + assert frame.dtype == pixel_array.dtype + assert frame.shape == shape + expected_frame = ( + pixel_array[f] if is_multiframe else pixel_array + ) + np.testing.assert_array_equal(frame, expected_frame) diff --git a/tests/utils.py b/tests/utils.py index f70233a8..a0d92193 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,9 +1,16 @@ from io import BytesIO +from pathlib import Path +from pydicom.data import get_testdata_files from pydicom.dataset import Dataset, FileMetaDataset from pydicom.filereader import dcmread +from highdicom._module_utils import ( + does_iod_have_pixel_data, +) + + def write_and_read_dataset(dataset: Dataset): """Write DICOM dataset to buffer and read it back from buffer.""" clone = Dataset(dataset) @@ -21,3 +28,64 @@ def write_and_read_dataset(dataset: Dataset): little_endian=little_endian, ) return dcmread(fp, force=True) + + +def find_readable_images() -> list[str]: + """Get a list of all images in highdicom and pydicom test data that should + be expected to work with image reading routines. 
+ + """ + # All pydicom test files + all_files = get_testdata_files() + + # Add highdicom test files + file_path = Path(__file__) + data_dir = file_path.parent.parent.joinpath('data/test_files') + hd_files = [str(f) for f in data_dir.glob("*.dcm")] + + all_files.extend(hd_files) + + # Various files are not expected to work and should be excluded + exclusions = [ + "badVR.dcm", # cannot be read due to bad VR + "MR_truncated.dcm", # pixel data is truncated + "liver_1frame.dcm", # missing number of frames + "JPEG2000-embedded-sequence-delimiter.dcm", # pydicom cannot decode pixels + "image_dfl.dcm", # deflated transfer syntax cannot be read lazily + "JPEG-lossy.dcm", # pydicom cannot decode pixels + "TINY_ALPHA", # no pixels + "SC_rgb_jpeg.dcm", # messed up transfer syntax + ] + + files_to_use = [] + + for f in all_files: + try: + # Skip image files that can't even be opened (the test files + # include some deliberately corrupted files) + dcm = dcmread(f) + except: + continue + + excluded = False + if 'SOPClassUID' not in dcm: + # Some are missing this... + continue + if not does_iod_have_pixel_data(dcm.SOPClassUID): + # Exclude non images + continue + if not dcm.file_meta.TransferSyntaxUID.is_little_endian: + # We don't support big endian + continue + + for exc in exclusions: + if exc in f: + excluded = True + break + + if excluded: + continue + + files_to_use.append(f) + + return files_to_use