Skip to content

Commit

Permalink
Fix for unpacking / provenance store path (github issue #211)
Browse files Browse the repository at this point in the history
  • Loading branch information
marcelzwiers committed Jan 24, 2024
1 parent c75c852 commit ac0b935
Show file tree
Hide file tree
Showing 5 changed files with 70 additions and 75 deletions.
55 changes: 28 additions & 27 deletions bidscoin/bids.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def __init__(self, provenance: Union[str, Path]='', plugins: dict=None, dataform
A source data type (e.g. DICOM or PAR) that can be converted to BIDS by the plugins
:param provenance: The full path of a representative file for this data source
:param plugins: The plugins that are used to interact with the source data type
:param plugins: The plugins that are used to interact with the source data type. Uses bidsmap['Options']['plugins'] format
:param dataformat: The dataformat name in the bidsmap, e.g. DICOM or PAR
:param datatype: The intended BIDS data type of the data source TODO: move to a separate BidsTarget/Mapping class
:param subprefix: The subprefix used in the sourcefolder
Expand Down Expand Up @@ -314,68 +314,69 @@ def dynamicvalue(self, value, cleanup: bool=True, runtime: bool=False):
return value


def unpack(sourcefolder: Path, wildcard: str='', workfolder: Path='') -> Tuple[List[Path], bool]:
def unpack(sesfolder: Path, wildcard: str='', workfolder: Path='', _subprefix: str='') -> Tuple[Set[Path], bool]:
"""
Unpacks and sorts DICOM files in sourcefolder to a temporary folder if sourcefolder contains a DICOMDIR file or .tar.gz, .gz or .zip files
:param sourcefolder: The full pathname of the folder with the source data
:param wildcard: A glob search pattern to select the tarball/zipped files (leave empty to skip unzipping)
:param workfolder: A root folder for temporary data
:return: Either ([unpacked and sorted session folders], True), or ([sourcefolder], False)
:param sesfolder: The full pathname of the folder with the source data
:param wildcard: A glob search pattern to select the tarball/zipped files (leave empty to skip unzipping)
:param workfolder: A root folder for temporary data
:param _subprefix: A pytest helper variable that is passed to dicomsort.sortsessions(args, subprefix=_subprefix)
:return: Either ([unpacked and sorted session folders], True), or ([sourcefolder], False)
"""

# Search for zipped/tarball files
tarzipfiles = list(sourcefolder.glob(wildcard)) if wildcard else []
tarzipfiles = list(sesfolder.glob(wildcard)) if wildcard else []

# See if we have a flat unsorted (DICOM) data organization, i.e. no directories, but DICOM-files
flatDICOM = not lsdirs(sourcefolder) and get_dicomfile(sourcefolder).is_file()
flatDICOM = not lsdirs(sesfolder) and get_dicomfile(sesfolder).is_file()

# Check if we are going to do unpacking and/or sorting
if tarzipfiles or flatDICOM or (sourcefolder/'DICOMDIR').is_file():
if tarzipfiles or flatDICOM or (sesfolder/'DICOMDIR').is_file():

if tarzipfiles:
LOGGER.info(f"Found zipped/tarball data in: {sourcefolder}")
LOGGER.info(f"Found zipped/tarball data in: {sesfolder}")
else:
LOGGER.info(f"Detected a {'flat' if flatDICOM else 'DICOMDIR'} data-structure in: {sourcefolder}")
LOGGER.info(f"Detected a {'flat' if flatDICOM else 'DICOMDIR'} data-structure in: {sesfolder}")

# Create a (temporary) sub/ses workfolder for unpacking the data
if not workfolder:
workfolder = Path(tempfile.mkdtemp(dir=tempfile.gettempdir()))
else:
workfolder = Path(workfolder)/next(tempfile._get_candidate_names())
worksubses = workfolder/sourcefolder.relative_to(sourcefolder.anchor)
worksubses.mkdir(parents=True, exist_ok=True)
worksesfolder = workfolder/sesfolder.relative_to(sesfolder.anchor)
worksesfolder.mkdir(parents=True, exist_ok=True)

# Copy everything over to the workfolder
LOGGER.info(f"Making temporary copy: {sourcefolder} -> {worksubses}")
shutil.copytree(sourcefolder, worksubses, dirs_exist_ok=True)
LOGGER.info(f"Making temporary copy: {sesfolder} -> {worksesfolder}")
shutil.copytree(sesfolder, worksesfolder, dirs_exist_ok=True)

# Unpack the zip/tarball files in the temporary folder
sessions = []
recursive = False
for tarzipfile in [worksubses/tarzipfile.name for tarzipfile in tarzipfiles]:
LOGGER.info(f"Unpacking: {tarzipfile.name} -> {worksubses}")
sessions: Set[Path] = set()
recursive = False
for tarzipfile in [worksesfolder/tarzipfile.name for tarzipfile in tarzipfiles]:
LOGGER.info(f"Unpacking: {tarzipfile.name} -> {worksesfolder}")
try:
shutil.unpack_archive(tarzipfile, worksubses)
shutil.unpack_archive(tarzipfile, worksesfolder)
except Exception as unpackerror:
LOGGER.warning(f"Could not unpack: {tarzipfile}\n{unpackerror}")
continue

# Sort the DICOM files in the worksubses rootfolder immediately (to avoid name collisions)
if not (worksubses/'DICOMDIR').is_file():
if get_dicomfile(worksubses).name:
sessions += dicomsort.sortsessions(worksubses, recursive=False)
# Sort the DICOM files in the worksesfolder root immediately (to avoid name collisions)
if not (worksesfolder/'DICOMDIR').is_file():
if get_dicomfile(worksesfolder).name:
sessions.update(dicomsort.sortsessions(worksesfolder, _subprefix, recursive=False))
else:
recursive = True # The unzipped data may have leading directory components

# Sort the DICOM files if not sorted yet (e.g. DICOMDIR)
sessions = sorted(set(sessions + dicomsort.sortsessions(worksubses, recursive=recursive)))
sessions.update(dicomsort.sortsessions(worksesfolder, _subprefix, recursive=recursive))

return sessions, True

else:

return [sourcefolder], False
return {sesfolder}, False


def is_dicomfile(file: Path) -> bool:
Expand Down Expand Up @@ -482,7 +483,7 @@ def get_parfiles(folder: Path) -> List[Path]:
LOGGER.verbose(f"Ignoring hidden folder: {folder}")
return []

parfiles = []
parfiles: List[Path] = []
for file in sorted(folder.iterdir()):
if file.name.startswith('.'):
LOGGER.verbose(f"Ignoring hidden file: {file}")
Expand Down
52 changes: 23 additions & 29 deletions bidscoin/utilities/dicomsort.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import uuid
from pydicom import fileset
from pathlib import Path
from typing import List
from typing import List, Set
from importlib.util import find_spec
if find_spec('bidscoin') is None:
import sys
Expand Down Expand Up @@ -136,12 +136,12 @@ def sortsession(sessionfolder: Path, dicomfiles: List[Path], folderscheme: str,


def sortsessions(sourcefolder: Path, subprefix: str='', sesprefix: str='', folderscheme: str='{SeriesNumber:03d}-{SeriesDescription}',
namescheme: str='', pattern: str=r'.*\.(IMA|dcm)$', recursive: bool=True, force: bool=False, dryrun: bool=False) -> List[Path]:
namescheme: str='', pattern: str=r'.*\.(IMA|dcm)$', recursive: bool=True, force: bool=False, dryrun: bool=False) -> Set[Path]:
"""
Wrapper around sortsession() to loop over subjects and sessions and map the session DICOM files
:param sourcefolder: The root folder containing the source [sub/][ses/]dicomfiles or the DICOMDIR file
:param subprefix: The prefix for searching the sub folders in session
:param subprefix: The prefix for searching the sub folders in session. Use '' to sort DICOMDIR files directly in sourcefolder
:param sesprefix: The prefix for searching the ses folders in sub folder
:param folderscheme: Optional naming scheme for the sorted (e.g. Series) subfolders. Follows the Python string formatting syntax with DICOM field names in curly bracers with an optional number of digits for numeric fields', default='{SeriesNumber:03d}-{SeriesDescription}'
:param namescheme: Optional naming scheme for renaming the files. Follows the Python string formatting syntax with DICOM field names in curly bracers, e.g. {PatientName}_{SeriesNumber:03d}_{SeriesDescription}_{AcquisitionNumber:05d}_{InstanceNumber:05d}.IMA
Expand All @@ -159,19 +159,24 @@ def sortsessions(sourcefolder: Path, subprefix: str='', sesprefix: str='', folde
sourcefolder = sourcefolder.parent
else:
LOGGER.error(f"Unexpected dicomsource argument '{sourcefolder}', aborting dicomsort()...")
return []
return set()
elif not sourcefolder.is_dir():
LOGGER.error(f"Sourcefolder '{sourcefolder}' not found")
return []
return set()
if (folderscheme and not validscheme(folderscheme)) or (namescheme and not validscheme(namescheme)):
LOGGER.error('Wrong scheme input argument(s), aborting dicomsort()...')
return []
if not subprefix: subprefix = ''
if not sesprefix: sesprefix = ''
return set()

# Do a recursive call if a sub- or ses-prefix is given
sessions: Set[Path] = set() # Collect the sorted session-folders
if subprefix or sesprefix:
LOGGER.info(f"Searching for subject/session folders in: {sourcefolder}")
for subjectfolder in lsdirs(sourcefolder, (subprefix or '') + '*'):
for sessionfolder in lsdirs(subjectfolder, sesprefix + '*') if sesprefix else [subjectfolder]:
sessions.update(sortsessions(sessionfolder, '', sesprefix, folderscheme, namescheme, pattern, recursive, force, dryrun))

# Use the DICOMDIR file if it is there
sessions = [] # Collect the sorted session-folders
if (sourcefolder/'DICOMDIR').is_file():
elif (sourcefolder/'DICOMDIR').is_file():
LOGGER.info(f"Reading: {sourcefolder/'DICOMDIR'}")
dicomdir = fileset.FileSet(sourcefolder/'DICOMDIR')
for patientid in dicomdir.find_values('PatientID'):
Expand All @@ -180,32 +185,21 @@ def sortsessions(sourcefolder: Path, subprefix: str='', sesprefix: str='', folde
study = dicomdir.find(PatientID=patientid, StudyInstanceUID=studyuid)
dicomfiles = [Path(instance.path) for instance in study]
if dicomfiles:
sessionfolder = sourcefolder/f"{subprefix}{cleanup(patient[0].PatientName)}"/f"{sesprefix}{n:02}-{cleanup(study[0].StudyDescription)}"
if subprefix is '': # == '' -> Recursive call of sortsessions() -> Sort directly in the sourcefolder
sessionfolder = sourcefolder
else: # CLI call -> Sort in subject/session folder
sessionfolder = sourcefolder/f"{subprefix or ''}{cleanup(patient[0].PatientName)}"/f"{sesprefix or ''}{n:02}-{cleanup(study[0].StudyDescription)}"
sortsession(sessionfolder, dicomfiles, folderscheme, namescheme, force, dryrun)
sessions.append(sessionfolder)

# Do a recursive call if a sub- or ses-prefix is given
elif subprefix or sesprefix:
LOGGER.info(f"Searching for subject/session folders in: {sourcefolder}")
for subjectfolder in lsdirs(sourcefolder, subprefix + '*'):
if sesprefix:
sessionfolders = lsdirs(subjectfolder, sesprefix + '*')
else:
sessionfolders = [subjectfolder]
for sessionfolder in sessionfolders:
sessions += sortsessions(sessionfolder, folderscheme=folderscheme, namescheme=namescheme, pattern=pattern, recursive=recursive, dryrun=dryrun)
sessions.add(sessionfolder)

# Sort the DICOM files in the sourcefolder
else:
sessions = [sourcefolder]
if recursive:
dicomfiles = [dcmfile for dcmfile in sourcefolder.rglob('*') if dcmfile.is_file() and re.match(pattern, str(dcmfile))]
else:
dicomfiles = [dcmfile for dcmfile in sourcefolder.iterdir() if dcmfile.is_file() and re.match(pattern, str(dcmfile))]
dicomfiles = [dcmfile for dcmfile in sourcefolder.glob('**/*' if recursive else '*') if dcmfile.is_file() and re.match(pattern, str(dcmfile))]
if dicomfiles:
sortsession(sourcefolder, dicomfiles, folderscheme, namescheme, force, dryrun)
sessions.add(sourcefolder)

return sorted(set(sessions))
return sessions


def main():
Expand Down
2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def raw_dicomdir(tmp_path_factory):
raw = tmp_path_factory.mktemp('raw_dicomdir')
shutil.copytree(Path(get_testdata_file('DICOMDIR')).parent, raw, dirs_exist_ok=True)
shutil.rmtree(raw/'TINY_ALPHA') # Does not contain normal DICOM fields / evokes warnings or errors
dicomsort.sortsessions(raw/'DICOMDIR') # The bidsmapper/coiner are NOT picking up the multi-subject DICOMDIR data properly :-(
dicomsort.sortsessions(raw/'DICOMDIR', None) # The bidsmapper/coiner are NOT picking up the multi-subject DICOMDIR data properly :-(
sourcesidecar = raw/'Doe^Peter'/'03-Brain'/'002-TSC RF FAST PILOT/4950.json'
with sourcesidecar.open('w') as sidecar:
json.dump({'SeriesDescription': 'TestExtAtrributes', 'Comment': 'TestExtComment'}, sidecar)
Expand Down
16 changes: 8 additions & 8 deletions tests/test_bids.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,11 +92,11 @@ def test_dynamicvalue(self, datasource):
assert datasource.dynamicvalue(r'<(0010, 0010)>') == 'CompressedSamplesMR1'

def test_unpack(dicomdir, tmp_path):
unpacked = bids.unpack(dicomdir.parent, '', tmp_path)
assert unpacked[1]
assert len(unpacked[0]) == 6
for ses in unpacked[0]:
assert 'Doe^Archibald' in ses.parts or 'Doe^Peter' in ses.parts
sessions, unpacked = bids.unpack(dicomdir.parent, '', tmp_path, None)
assert unpacked
assert len(sessions) == 6
for session in sessions:
assert 'Doe^Archibald' in session.parts or 'Doe^Peter' in session.parts


def test_is_dicomfile(dcm_file):
Expand Down Expand Up @@ -403,7 +403,7 @@ def test_increment_runindex__runless_exist(tmp_path):

# Check the results
assert bidsname == 'sub-01_run-2_T1w'
assert (outfolder / 'sub-01_T1w').with_suffix(suffix).is_file() is True
assert (outfolder/'sub-01_T1w').with_suffix(suffix).is_file() is True


def test_increment_runindex__runless_run2_exist(tmp_path):
Expand All @@ -421,8 +421,8 @@ def test_increment_runindex__runless_run2_exist(tmp_path):

# Check the results
assert bidsname == 'sub-01_run-3_T1w.nii.gz'
assert (outfolder / 'sub-01_T1w').with_suffix(suffix).is_file() is True
assert (outfolder / 'sub-01_run-2_T1w').with_suffix(suffix).is_file() is True
assert (outfolder/'sub-01_T1w').with_suffix(suffix).is_file() is True
assert (outfolder/'sub-01_run-2_T1w').with_suffix(suffix).is_file() is True


def test_increment_runindex__run1_run2_exist(tmp_path):
Expand Down
20 changes: 10 additions & 10 deletions tests/test_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,19 @@

def test_dicomsort(tmp_path):
shutil.copytree(Path(get_testdata_file('DICOMDIR')).parent, tmp_path, dirs_exist_ok=True)
session = dicomsort.sortsessions(tmp_path/'DICOMDIR', folderscheme='{SeriesNumber:04d}-{SeriesDescription}', namescheme='{SeriesNumber:02d}_{SeriesDescription}_{AcquisitionNumber}_{InstanceNumber}.IMA', force=True)
assert dicomsort.sortsessions(tmp_path/'DICOMDIR', folderscheme='{SeriesNumber:04d]-{SeriesDescription}') == [] # Invalid scheme -> clean return
session = sorted(dicomsort.sortsessions(tmp_path/'DICOMDIR', None, folderscheme='{SeriesNumber:04d}-{SeriesDescription}', namescheme='{SeriesNumber:02d}_{SeriesDescription}_{AcquisitionNumber}_{InstanceNumber}.IMA', force=True))
assert dicomsort.sortsessions(tmp_path/'DICOMDIR', None, folderscheme='{SeriesNumber:04d]-{SeriesDescription}') == set() # Invalid scheme -> clean return
assert dicomsort.validscheme('{foo:04d}-{123}') == True
assert dicomsort.validscheme('{foo:04d]-{bar}') == False
assert dicomsort.validscheme('{foo:04}-{bar}') == False
assert (tmp_path/'Doe^Peter').is_dir() # Subject (Patient): 98890234 -> Doe^Peter
assert (tmp_path/'Doe^Archibald').is_dir() # 77654033 -> Doe^Archibald
assert len(list((tmp_path/'Doe^Archibald').rglob('*'))) == 13 # 6 directories + 7 files
assert len(list((tmp_path/'Doe^Peter').rglob('*'))) == 37 # 13 directories + 24 files
assert session[0].parent.name == 'Doe^Archibald' # Subject (Patient)
assert session[0].name == '01-XR C Spine Comp Min 4 Views' # Session (Study)
assert sorted(sorted(session[0].iterdir())[0].iterdir())[0].parent.name == '0001-Cervical LAT' # Series -> folderscheme
assert sorted(sorted(session[0].iterdir())[0].iterdir())[0].name == '01_Cervical LAT__1.IMA' # File -> namescheme
assert (tmp_path/'Doe^Peter').is_dir() # Subject (Patient): 98890234 -> Doe^Peter
assert (tmp_path/'Doe^Archibald').is_dir() # 77654033 -> Doe^Archibald
assert len(list((tmp_path/'Doe^Archibald').rglob('*'))) == 13 # 6 directories + 7 files
assert len(list((tmp_path/'Doe^Peter').rglob('*'))) == 37 # 13 directories + 24 files
assert session[0].parent.name == 'Doe^Archibald' # Subject (Patient)
assert session[0].name == '01-XR C Spine Comp Min 4 Views' # Session (Study)
assert sorted(sorted(session[0].iterdir())[0].iterdir())[0].parent.name == '0001-Cervical LAT' # Series -> folderscheme
assert sorted(sorted(session[0].iterdir())[0].iterdir())[0].name == '01_Cervical LAT__1.IMA' # File -> namescheme


def test_bidsparticipants(raw_dicomdir, bids_dicomdir, bidsmap_dicomdir):
Expand Down

0 comments on commit ac0b935

Please sign in to comment.