Source code for histsetup

import os
from pkgutil import extend_path
import re
import sys
import getopt
import pandas as pd
import shutil
import matplotlib.pyplot as plt
import numpy as np
from skimage.transform import resize
import h5py

def _detect_img_ext(fnames):
    extensions = [os.path.splitext(f)[-1] for f in fnames]
    extensions = list(set(extensions))
    extension = [ext for ext in extensions if ext not in ['.csv','.tsv','.json']]
    if len(extension) != 1:
        raise Exception("Image extension is ambiguous. You may set the extension name explicitly with the 'ext' argument.")
    return extension[0]


[docs]def downsample_slices(subject_dir, output_dir, ext='', slice_downfactor=1, image_downfactor=1, sep='_',fnumidx=-1): """ Downsample Slices Copy a subset of optionally downsampled 2D images to the output directory. Parameters ---------- subject_dir: str Path to image series output_dir: str Output path ext: str subject image file extension (e.g. 'png') slice_downfactor: int Specifies n, where every nth image will be copied to the output folder. image_downfactor: int Image downsampling factor. Example ------- >>> subject_dir = '/path/to/histology/data >>> output_dir = 'example_outputs' >>> downsample_slices(subject_dir, output_dir, ext='png', slice_downfactor=20, image_downfactor=32) Raises ------ FileNotFoundError If the subject directory does not exist. """ try: fnames = os.listdir(subject_dir) except FileNotFoundError: print('subject directory does not exist') sys.exit(1) if len(fnames) == 0: print('subject directory is empty') sys.exit(1) if ext == '': ext = _detect_img_ext(fnames) if not os.path.exists(output_dir): os.makedirs(output_dir) # copy geometry.csv to output if it exists if 'geometry.csv' in fnames: shutil.copy(os.path.join(subject_dir, 'geometry.csv'), output_dir) fnames = [i for i in fnames if i.endswith(ext)] fnames = sorted(fnames, key=lambda x: x.split(sep)[fnumidx][:4]) # get the number of slices (missing slices included) max_slice = int(fnames[-1].split('_')[-1][:4]) # get list of slices to be kept after downsampling (these are evenly spaced numbers that may include missing slices) slices_to_keep = np.arange(1, max_slice+1, slice_downfactor) # Find paths to existing slices in the downsampled list (intersection of existing slices and downsampled slices) slice_nums = [int(f.split('_')[-1][:4]) for f in fnames] slice_nums_down = [x for x in slices_to_keep if x in slice_nums] fnames_down = [f for f in fnames if int(f.split('_')[-1][:4]) in slice_nums_down] fpaths_down = [os.path.join(subject_dir,x) for x in fnames_down] for i in range(len(fpaths_down)): # downsample images if specified if image_downfactor > 1: img = plt.imread(fpaths_down[i]) img_resized = resize(img, (img.shape[0] // image_downfactor, img.shape[1] // image_downfactor),\ anti_aliasing=True, preserve_range=False) plt.imsave(os.path.join(output_dir, os.path.splitext(os.path.basename(fpaths_down[i]))[0]+'.png'), img_resized) else: shutil.copy(fpaths_down[i], output_dir)
[docs]def make_samples_tsv(subject_dir, ext='', slice_downfactor=1, max_slice=None, sep='_',fnumidx=-1): """ Make 'samples.tsv' file Saves a tsv file listing the images in the folder. Parameters ---------- subject_dir: path Path to the dataset. ext: str image file extension (e.g. 'png') max_slice: int Number of slices in the original dataset (not currently used) Example ------- >>> subject_dir = 'example_outputs' >>> make_samples_tsv(subject_dir, 'png') Raises ------ FileNotFoundError If the subject directory does not exist. """ # create list of files from source directory (subject_dir) in order of slice number fnames = os.listdir(subject_dir) if len(fnames) == 0: print('subject directory is empty') sys.exit(1) if ext == '': ext = _detect_img_ext(fnames) fnames = [i for i in fnames if i.endswith(ext)] fnames = sorted(fnames, key=lambda x: x.split(sep)[fnumidx][:4]) # uncomment this block to include missing images in tsv file (not currently implemented for registration) # get missing image numbers img_nums = [int(x.split('_')[-1][:4]) for x in fnames] if not max_slice: max_slice = img_nums[-1] slice_range = np.arange(1, max_slice+1, slice_downfactor) missing_imgs = list(set(img_nums) - set(slice_range)) + list(set(slice_range) - set(img_nums)) missing_imgs = sorted(missing_imgs) print("missing images: \n", missing_imgs) # insert missing image names into fnames missing_ids = [] for i in range(len(missing_imgs)): fname = fnames[0].split('-')[0] + '-_' + str(missing_imgs[i]).zfill(4) + '.' + fnames[0].split('.')[-1] N = len(img_nums) for j in range(N): if img_nums[j] > missing_imgs[i]: fnames.insert(j, fname) img_nums.insert(j, missing_imgs[i]) missing_ids.append(j) break elif j==(N-1): fnames.append(fname) img_nums.append(missing_imgs[i]) missing_ids.append(j+1) break # make samples.tsv file with open(os.path.join(subject_dir, 'samples.tsv'), 'w') as f: # f.write('sample_id participant_id species status\n') f.write('sample_id\tparticipant_id\tspecies\tstatus\n') status = 'present' j = 0 for i in range(len(fnames)): status = 'present' for idx in missing_ids: if i == idx: status = 'missing' break f.write('{0}\t{1}\tMus Musculus\t{2}\n'.format(fnames[i], fnames[i][:5], status))
# Set up metadata JSON file # Need Pixel Size, field of view, thickness, offset
[docs]def generate_sidecars(subject_dir, ext='', max_slice=None, dtype='uint8', dv=[14.72,14.72,10.], slice_downfactor=1, sep='_', fnumidx=-1, space='right-inferior-posterior'): """ Generate Sidecar Files Saves out JSON format sidecare files for each image in the dataset. Parameters ---------- subject_dir: str Path to image series ext: str image file extension (e.g. 'png') max_slice: int Number of slices in the original dataset (used to calcuate spatial information for the slices). dtype: str Image data type dv: list of float voxel spacing in microns (ordered: row, col, slice) slice_downfactor: int Factor used to reduce the number of images from the original dataset. Example ------- >>> subject_dir = 'example_outputs' >>> generate_sidecars(subject_dir, 'png', max_slice=1389, dv=[14.72,14.72,10.0]) MD787-N1-2019.03.28-21.52.46_MD787_1_0001.json MD787-N7-2019.03.28-22.05.43_MD787_3_0021.json MD787-N14-2019.03.28-22.20.46_MD787_2_0041.json ... Raises ------ FileNotFoundError If the subject directory does not exist. """ fnames = os.listdir(subject_dir) # Make sure the directory is not empty if len(fnames) == 0: print('subject directory is empty') sys.exit(1) if ext == '': ext = _detect_img_ext(fnames) sizes = [] space_directions = [] space_origin = [] geometry = 'geometry.csv' in fnames fnames = [i for i in fnames if i.endswith(ext)] # this removes geometry.csv from fnames fnames = sorted(fnames, key=lambda x: x.split(sep)[fnumidx][:4]) # get geometry info if geometry: # first get geometry info geometry = pd.read_csv(os.path.join(subject_dir, 'geometry.csv'), index_col=False) dv = [] for i in range(len(geometry)): dv.append([geometry.iloc[i, 4], geometry.iloc[i, 5], geometry.iloc[i, 6]]) sizes.append([4, geometry.iloc[i, 1], geometry.iloc[i, 2], geometry.iloc[i, 3]]) space_directions.append([[dv[i][0], 0., 0.], [0., dv[i][1], 0.], [0., 0., dv[i][2]*slice_downfactor]]) space_origin.append([geometry.iloc[i, 7], geometry.iloc[i, 8], geometry.iloc[i, 9]]) else: if not max_slice: max_slice = int(fnames[-1].split(sep)[fnumidx][:4]) dx, dy, dz = dv center = (max_slice - 1) / 2 for i in range(len(fnames)): slice_num = int(fnames[i].split('_')[-1][:4]) img_path = os.path.join(subject_dir, fnames[i]) try: img_shape = plt.imread(img_path).shape except: if 'h5' in ext: with h5py.File(img_path,'r') as f: img_shape = list(f[list(f.keys())[0]][:].shape[:2]) img_shape.append(1) # append 1 to represent channels else: print('file type not recognized') x_origin = -(img_shape[1] - 1) / 2 * dx y_origin = -(img_shape[0] - 1) / 2 * dy space_origin.append([x_origin, y_origin, dz / slice_downfactor * (slice_num-1 - center)]) sizes.append([img_shape[2], img_shape[1], img_shape[0], 1]) space_directions.append([[dx, 0., 0.], [0., dy, 0.], [0., 0., dz]]) sample_ids = [os.path.splitext(x)[0] for x in fnames] for i in range(len(fnames)): print(sample_ids[i] + '.json') img_path = os.path.join(subject_dir, fnames[i]) with open(os.path.join(subject_dir, sample_ids[i] + '.json'), 'w') as f: f.write('{{\n' ' \"DataFile\": \"{0}\",\n' ' \"Type\": \"{1}\",\n' ' \"Dimension\": 4,\n' ' \"Sizes\": {2},\n' ' \"Endian\": \"big\",\n' ' \"Space\": \"{3}\",\n' ' \"SpaceDimension\": 3,\n' ' \"SpaceUnits\": [\"um\", \"um\", \"um\"],\n' ' \"SpaceDirections\": ["none", {4}, {5}, {6}],\n' ' \"SpaceOrigin\": {7}\n' '}}'.format(fnames[i], dtype, sizes[i], space, space_directions[i][0], space_directions[i][1], space_directions[i][2], space_origin[i]))
[docs]def main(): subject_dir = '.' output_dir = '.' ext = '' slice_down = 1 res_down = 1 max_slice = None dv = [14.72, 14.72, 10.] space = "right-inferior-posterior" try: opts, args = getopt.getopt(sys.argv[1:], "hi:o:d:e:s:r:m:") except getopt.GetoptError: print('histsetup.py -i <inputDir> -o <outputDir> -d <dv> -e <fileExtension> -s <sliceDownFactor> -r <resolutionDownFactor> -m <maxSlice> -p <space>') sys.exit(2) for opt, arg in opts: if opt == '-h': print('histsetup.py -i <inputDir> -o <outputDir> -d <dv> -e <fileExtension> -s <sliceDownFactor> -r <resolutionDownFactor> -m <maxSlice> -p <space>') sys.exit() elif opt == '-i': subject_dir = arg elif opt == '-o': output_dir = arg elif opt == '-d': dv = list(map(float, arg[1:-1].split(','))) elif opt == '-e': ext = arg elif opt == '-s': slice_down = int(arg) elif opt == '-r': res_down = int(arg) elif opt == '-m': max_slice = int(arg) elif opt == 'p': space = str(arg) dv = [dv[0]*res_down, dv[1]*res_down, dv[2]*slice_down] print('subject directory: ', subject_dir, '\n', 'output directory: ', output_dir, '\n', 'slice downsample factor: ', slice_down, '\n', 'resolution downsample factor: ', res_down, '\n', 'maximum slice number: ', max_slice, '\n', 'dv: ', dv) downsample_slices(subject_dir, output_dir, ext, slice_downfactor=slice_down, image_downfactor=res_down) if res_down > 1: ext = 'png' make_samples_tsv(output_dir, ext=ext, slice_downfactor=slice_down, max_slice=max_slice) generate_sidecars(output_dir, ext=ext, max_slice=max_slice, dv=dv, slice_downfactor=slice_down, space=space)
if __name__ == '__main__': main()