Fixes and test for ImageFileReader

ImagingDataCommons · Jan 9, 2025 · f60e013 · f60e013
1 parent 8c2c1e0
commit f60e013
Show file tree

Hide file tree

Showing 3 changed files with 175 additions and 11 deletions.
diff --git a/src/highdicom/io.py b/src/highdicom/io.py
@@ -19,7 +19,7 @@
     read_partial
 )
 from pydicom.tag import TupleTag, ItemTag, SequenceDelimiterTag
-from pydicom.uid import UID
+from pydicom.uid import UID, DeflatedExplicitVRLittleEndian
 
 from highdicom.frame import decode_frame
 from highdicom.color import ColorManager
@@ -159,8 +159,13 @@ def _build_bot(fp: DicomFileLike, number_of_frames: int) -> List[int]:
 
     """
     initial_position = fp.tell()
-    offset_values = []
-    current_offset = 0
+
+    # We will keep two lists, one of all fragment boundaries (regardless of
+    # whether or not they are frame boundaries) and the other of just those
+    # fragment boundaries that are known to be frame boundaries (as identified
+    # by JPEG start markers).
+    frame_offset_values = []
+    fragment_offset_values = []
     i = 0
     while True:
         frame_position = fp.tell()
@@ -187,26 +192,33 @@ def _build_bot(fp: DicomFileLike, number_of_frames: int) -> List[int]:
                 f'Length of Frame item #{i} is zero.'
             )
 
-        first_two_bytes = fp.read(2)
-        if not fp.is_little_endian:
-            first_two_bytes = first_two_bytes[::-1]
+        current_offset = frame_position - initial_position
+        fragment_offset_values.append(current_offset)
+
         # In case of fragmentation, we only want to get the offsets to the
         # first fragment of a given frame. We can identify those based on the
         # JPEG and JPEG 2000 markers that should be found at the beginning and
         # end of the compressed byte stream.
+        first_two_bytes = fp.read(2)
+        if not fp.is_little_endian:
+            first_two_bytes = first_two_bytes[::-1]
+
         if first_two_bytes in _START_MARKERS:
-            current_offset = frame_position - initial_position
-            offset_values.append(current_offset)
+            frame_offset_values.append(current_offset)
 
         i += 1
         fp.seek(length - 2, 1)  # minus the first two bytes
 
-    if len(offset_values) != number_of_frames:
+    if len(frame_offset_values) == number_of_frames:
+        basic_offset_table = frame_offset_values
+    elif len(fragment_offset_values) == number_of_frames:
+        # This covers RLE and others that have no frame markers but have a
+        # single fragment per frame
+        basic_offset_table = fragment_offset_values
+    else:
         raise ValueError(
             'Number of frame items does not match specified Number of Frames.'
         )
-    else:
-        basic_offset_table = offset_values
 
     fp.seek(initial_position, 0)
     return basic_offset_table
@@ -426,6 +438,16 @@ def _read_metadata(self) -> None:
         self._metadata = Dataset(metadata)
 
         self._pixel_data_offset = self._fp.tell()
+
+        if self.transfer_syntax_uid == DeflatedExplicitVRLittleEndian:
+            # The entire file is compressed with DEFLATE. Such files cannot be
+            # used since the entire file must be decompressed to read or build
+            # the basic/extended offset table.
+            raise ValueError(
+                'Deflated transfer syntaxes cannot be used with the '
+                'ImageFileReader.'
+            )
+
         # Determine whether dataset contains a Pixel Data element
         try:
             tag = TupleTag(self._fp.read_tag())

diff --git a/tests/test_io.py b/tests/test_io.py
@@ -6,8 +6,10 @@
 from pydicom import dcmread
 from pydicom.data import get_testdata_file
 from pydicom.filebase import DicomBytesIO, DicomFileLike
+import pytest
 
 from highdicom.io import ImageFileReader
+from tests.utils import find_readable_images
 
 
 class TestImageFileReader(unittest.TestCase):
@@ -212,3 +214,75 @@ def test_read_single_frame_ct_image_dicom_file_like_opened(self):
                 reader.metadata.Columns,
             )
             np.testing.assert_array_equal(frame, pixel_array)
+
+    def test_read_rle_no_bot(self):
+        # This image is RLE compressed but has no BOT, requiring searching
+        # through the pixel data for delimiter tags
+        filename = Path(get_testdata_file('rtdose_rle.dcm'))
+
+        dataset = dcmread(filename)
+        pixel_array = dataset.pixel_array
+        with ImageFileReader(filename) as reader:
+            assert reader.number_of_frames == 15
+            for f in range(reader.number_of_frames):
+                frame = reader.read_frame(f, correct_color=False)
+                assert isinstance(frame, np.ndarray)
+                assert frame.ndim == 2
+                assert frame.dtype == np.uint32
+                assert frame.shape == (
+                    reader.metadata.Rows,
+                    reader.metadata.Columns,
+                )
+                np.testing.assert_array_equal(frame, pixel_array[f])
+
+    def test_disallow_deflated_dataset(self):
+        # Files with a deflated transfer syntax must be rejected with an error
+        msg = (
+            'Deflated transfer syntaxes cannot be used with the '
+            'ImageFileReader.'
+        )
+        filename = get_testdata_file('image_dfl.dcm')
+
+        with pytest.raises(ValueError, match=msg):
+            with ImageFileReader(filename) as reader:
+                reader.read_frame(1)
+
+
+@pytest.mark.parametrize(
+    'filename',
+    find_readable_images(),
+)
+def test_all_images(filename):
+    dataset = dcmread(filename)
+    pixel_array = dataset.pixel_array
+
+    is_color = dataset.SamplesPerPixel == 3
+    number_of_frames = dataset.get('NumberOfFrames', 1)
+    is_multiframe = number_of_frames > 1
+
+    if is_color:
+        ndim = 3
+        shape = (
+            dataset.Rows,
+            dataset.Columns,
+            3
+        )
+    else:
+        ndim = 2
+        shape = (
+            dataset.Rows,
+            dataset.Columns,
+        )
+
+    with ImageFileReader(filename) as reader:
+        assert reader.number_of_frames == number_of_frames
+        for f in range(reader.number_of_frames):
+            frame = reader.read_frame(f, correct_color=False)
+            assert isinstance(frame, np.ndarray)
+            assert frame.ndim == ndim
+            assert frame.dtype == pixel_array.dtype
+            assert frame.shape == shape
+            expected_frame = (
+                pixel_array[f] if is_multiframe else pixel_array
+            )
+            np.testing.assert_array_equal(frame, expected_frame)
diff --git a/tests/utils.py b/tests/utils.py
@@ -1,9 +1,16 @@
 from io import BytesIO
 
+from pathlib import Path
+from pydicom.data import get_testdata_files
 from pydicom.dataset import Dataset, FileMetaDataset
 from pydicom.filereader import dcmread
 
 
+from highdicom._module_utils import (
+    does_iod_have_pixel_data,
+)
+
+
 def write_and_read_dataset(dataset: Dataset):
     """Write DICOM dataset to buffer and read it back from buffer."""
     clone = Dataset(dataset)
@@ -21,3 +28,64 @@ def write_and_read_dataset(dataset: Dataset):
             little_endian=little_endian,
         )
         return dcmread(fp, force=True)
+
+
+def find_readable_images() -> list[str]:
+    """Get a list of all images in highdicom and pydicom test data that should
+    be expected to work with image reading routines.
+
+    """
+    # All pydicom test files
+    all_files = get_testdata_files()
+
+    # Add highdicom test files
+    file_path = Path(__file__)
+    data_dir = file_path.parent.parent.joinpath('data/test_files')
+    hd_files = [str(f) for f in data_dir.glob("*.dcm")]
+
+    all_files.extend(hd_files)
+
+    # Various files are not expected to work and should be excluded
+    exclusions = [
+        "badVR.dcm",  # cannot be read due to bad VR
+        "MR_truncated.dcm",  # pixel data is truncated
+        "liver_1frame.dcm",  # missing number of frames
+        "JPEG2000-embedded-sequence-delimiter.dcm",  # pydicom cannot decode pixels
+        "image_dfl.dcm",  # deflated transfer syntax cannot be read lazily
+        "JPEG-lossy.dcm",  # pydicom cannot decode pixels
+        "TINY_ALPHA",  # no pixels
+        "SC_rgb_jpeg.dcm",  # messed up transfer syntax
+    ]
+
+    files_to_use = []
+
+    for f in all_files:
+        try:
+            # Skip image files that can't even be opened (the test files
+            # include some deliberately corrupted files)
+            dcm = dcmread(f)
+        except:
+            continue
+
+        excluded = False
+        if 'SOPClassUID' not in dcm:
+            # Some are missing this...
+            continue
+        if not does_iod_have_pixel_data(dcm.SOPClassUID):
+            # Exclude non images
+            continue
+        if not dcm.file_meta.TransferSyntaxUID.is_little_endian:
+            # We don't support big endian
+            continue
+
+        for exc in exclusions:
+            if exc in f:
+                excluded = True
+                break
+
+        if excluded:
+            continue
+
+        files_to_use.append(f)
+
+    return files_to_use