import os
from pkgutil import extend_path
import re
import sys
import getopt
import pandas as pd
import shutil
import matplotlib.pyplot as plt
import numpy as np
from skimage.transform import resize
import h5py
def _detect_img_ext(fnames):
extensions = [os.path.splitext(f)[-1] for f in fnames]
extensions = list(set(extensions))
extension = [ext for ext in extensions if ext not in ['.csv','.tsv','.json']]
if len(extension) != 1:
raise Exception("Image extension is ambiguous. You may set the extension name explicitly with the 'ext' argument.")
return extension[0]
def downsample_slices(subject_dir, output_dir, ext='', slice_downfactor=1, image_downfactor=1, sep='_', fnumidx=-1):
    """ Downsample Slices

    Copy a subset of optionally downsampled 2D images to the output directory.
    If the subject directory contains 'geometry.csv', it is copied as well.

    Parameters
    ----------
    subject_dir: str
        Path to image series
    output_dir: str
        Output path (created if it does not exist)
    ext: str
        subject image file extension (e.g. 'png'). Detected from the directory
        contents when left empty.
    slice_downfactor: int
        Specifies n, where every nth image will be copied to the output folder.
    image_downfactor: int
        Image downsampling factor. When greater than 1 the resized images are
        always written as '.png'.
    sep: str
        Separator used to split a file name into fields.
    fnumidx: int
        Index of the field (after splitting on ``sep``) whose first four
        characters are the slice number.

    Example
    -------
    >>> subject_dir = '/path/to/histology/data'
    >>> output_dir = 'example_outputs'
    >>> downsample_slices(subject_dir, output_dir, ext='png', slice_downfactor=20, image_downfactor=32)

    Raises
    ------
    SystemExit
        If the subject directory does not exist, is empty, or contains no
        file with the requested extension.
    """
    def slice_number(fname):
        # The slice number is the first four characters of the selected field.
        return int(fname.split(sep)[fnumidx][:4])

    try:
        fnames = os.listdir(subject_dir)
    except FileNotFoundError:
        print('subject directory does not exist')
        sys.exit(1)
    if len(fnames) == 0:
        print('subject directory is empty')
        sys.exit(1)
    if ext == '':
        ext = _detect_img_ext(fnames)
    os.makedirs(output_dir, exist_ok=True)
    # Copying a file onto itself raises shutil.SameFileError, so plain copies
    # are skipped when input and output are the same directory.
    same_dir = os.path.samefile(subject_dir, output_dir)

    # copy geometry.csv to output if it exists
    if 'geometry.csv' in fnames and not same_dir:
        shutil.copy(os.path.join(subject_dir, 'geometry.csv'), output_dir)

    fnames = sorted((f for f in fnames if f.endswith(ext)), key=slice_number)
    if not fnames:
        print("no files ending with '{0}' found in subject directory".format(ext))
        sys.exit(1)

    # Highest slice number present (missing slices included in the count).
    max_slice = slice_number(fnames[-1])
    # Evenly spaced slice numbers to keep; some of them may not exist on disk.
    slices_to_keep = set(range(1, max_slice + 1, slice_downfactor))

    for fname in fnames:
        if slice_number(fname) not in slices_to_keep:
            continue
        src_path = os.path.join(subject_dir, fname)
        if image_downfactor > 1:
            # downsample the image in-plane and save it as PNG
            img = plt.imread(src_path)
            new_shape = (img.shape[0] // image_downfactor, img.shape[1] // image_downfactor)
            img_resized = resize(img, new_shape, anti_aliasing=True, preserve_range=False)
            out_name = os.path.splitext(fname)[0] + '.png'
            plt.imsave(os.path.join(output_dir, out_name), img_resized)
        elif not same_dir:
            shutil.copy(src_path, output_dir)
def make_samples_tsv(subject_dir, ext='', slice_downfactor=1, max_slice=None, sep='_', fnumidx=-1):
    """ Make 'samples.tsv' file

    Saves a tsv file listing the images in the folder, ordered by slice
    number. Slices that are expected (1, 1+n, 1+2n, ... up to ``max_slice``
    for ``slice_downfactor`` n) but have no file get a placeholder row with
    status 'missing'; every existing image gets status 'present'.

    Parameters
    ----------
    subject_dir: path
        Path to the dataset. 'samples.tsv' is written into this directory.
    ext: str
        image file extension (e.g. 'png'). Detected from the directory
        contents when left empty.
    slice_downfactor: int
        Spacing between expected slice numbers.
    max_slice: int
        Highest expected slice number. Defaults to the highest slice number
        found among the images.
    sep: str
        Separator used to split a file name into fields.
    fnumidx: int
        Index of the field (after splitting on ``sep``) whose first four
        characters are the slice number.

    Example
    -------
    >>> subject_dir = 'example_outputs'
    >>> make_samples_tsv(subject_dir, 'png')

    Raises
    ------
    FileNotFoundError
        If the subject directory does not exist.
    SystemExit
        If the subject directory is empty or contains no file with the
        requested extension.
    """
    def slice_number(fname):
        # The slice number is the first four characters of the selected field.
        return int(fname.split(sep)[fnumidx][:4])

    # list the images of the source directory in order of slice number
    fnames = os.listdir(subject_dir)
    if len(fnames) == 0:
        print('subject directory is empty')
        sys.exit(1)
    if ext == '':
        ext = _detect_img_ext(fnames)
    fnames = sorted((f for f in fnames if f.endswith(ext)), key=slice_number)
    if not fnames:
        print("no files ending with '{0}' found in subject directory".format(ext))
        sys.exit(1)

    img_nums = [slice_number(f) for f in fnames]
    if not max_slice:
        max_slice = img_nums[-1]

    # Only slices that are expected but absent are missing. An image that
    # exists outside the expected range is still present and must not get a
    # placeholder row.
    present_nums = set(img_nums)
    missing_imgs = [n for n in range(1, max_slice + 1, slice_downfactor) if n not in present_nums]
    print("missing images: \n", missing_imgs)

    # Placeholder names reuse the prefix and extension of the first image.
    prefix = fnames[0].split('-')[0]
    suffix = fnames[0].split('.')[-1]
    rows = [(num, fname, 'present') for num, fname in zip(img_nums, fnames)]
    rows.extend(
        (num, prefix + '-' + sep + str(num).zfill(4) + '.' + suffix, 'missing')
        for num in missing_imgs
    )
    # Stable sort: images sharing a slice number keep their relative order.
    rows.sort(key=lambda row: row[0])

    # make samples.tsv file
    with open(os.path.join(subject_dir, 'samples.tsv'), 'w') as f:
        f.write('sample_id\tparticipant_id\tspecies\tstatus\n')
        for _, fname, status in rows:
            # participant_id is taken from the first five characters of the name
            f.write('{0}\t{1}\tMus Musculus\t{2}\n'.format(fname, fname[:5], status))
# Set up metadata JSON file
# Need Pixel Size, field of view, thickness, offset
def generate_sidecars(subject_dir, ext='', max_slice=None, dtype='uint8', dv=[14.72,14.72,10.], slice_downfactor=1, sep='_', fnumidx=-1, space='right-inferior-posterior'):
    """ Generate Sidecar Files

    Saves out a JSON format sidecar file for each image in the dataset.
    If the directory contains 'geometry.csv', sizes, spacing and origin are
    read from it (columns 1-3: sizes, 4-6: spacing, 7-9: origin) and ``dv``
    and ``max_slice`` are ignored. Otherwise they are computed from the image
    shapes, ``dv`` and the slice number encoded in each file name.

    Parameters
    ----------
    subject_dir: str
        Path to image series
    ext: str
        image file extension (e.g. 'png'). Detected from the directory
        contents when left empty.
    max_slice: int
        Number of slices in the original dataset (used to calculate spatial
        information for the slices). Defaults to the highest slice number
        found among the images.
    dtype: str
        Image data type
    dv: list of float
        voxel spacing in microns (ordered: row, col, slice). The default list
        is only read, never modified.
    slice_downfactor: int
        Factor used to reduce the number of images from the original dataset.
    sep: str
        Separator used to split a file name into fields.
    fnumidx: int
        Index of the field (after splitting on ``sep``) whose first four
        characters are the slice number.
    space: str
        Value written to the "Space" entry of every sidecar.

    Example
    -------
    >>> subject_dir = 'example_outputs'
    >>> generate_sidecars(subject_dir, 'png', max_slice=1389, dv=[14.72,14.72,10.0])
    MD787-N1-2019.03.28-21.52.46_MD787_1_0001.json
    MD787-N7-2019.03.28-22.05.43_MD787_3_0021.json
    MD787-N14-2019.03.28-22.20.46_MD787_2_0041.json
    ...

    Raises
    ------
    FileNotFoundError
        If the subject directory does not exist.
    SystemExit
        If the subject directory is empty, contains no file with the
        requested extension, or an image cannot be read.
    """
    def native(value):
        # numpy scalars (as returned by DataFrame.iloc) expose .item(); their
        # repr under NumPy >= 2 is e.g. 'np.int64(4)', which is not valid JSON.
        return value.item() if hasattr(value, 'item') else value

    fnames = os.listdir(subject_dir)
    # Make sure the directory is not empty
    if len(fnames) == 0:
        print('subject directory is empty')
        sys.exit(1)
    if ext == '':
        ext = _detect_img_ext(fnames)

    sizes = []
    space_directions = []
    space_origin = []
    has_geometry = 'geometry.csv' in fnames
    # filtering on the extension also removes geometry.csv from fnames
    fnames = sorted((f for f in fnames if f.endswith(ext)),
                    key=lambda name: name.split(sep)[fnumidx][:4])
    if not fnames:
        print("no files ending with '{0}' found in subject directory".format(ext))
        sys.exit(1)

    if has_geometry:
        geometry = pd.read_csv(os.path.join(subject_dir, 'geometry.csv'), index_col=False)

        def cell(row, col):
            return native(geometry.iloc[row, col])

        # NOTE(review): row i of geometry.csv is paired with the i-th image in
        # sorted order — confirm the csv lists exactly these images, in order.
        for i in range(len(geometry)):
            dx, dy, dz = cell(i, 4), cell(i, 5), cell(i, 6)
            sizes.append([4, cell(i, 1), cell(i, 2), cell(i, 3)])
            space_directions.append([[dx, 0., 0.], [0., dy, 0.], [0., 0., dz * slice_downfactor]])
            space_origin.append([cell(i, 7), cell(i, 8), cell(i, 9)])
    else:
        if not max_slice:
            max_slice = int(fnames[-1].split(sep)[fnumidx][:4])
        dx, dy, dz = dv
        center = (max_slice - 1) / 2
        for fname in fnames:
            slice_num = int(fname.split(sep)[fnumidx][:4])
            img_path = os.path.join(subject_dir, fname)
            try:
                img_shape = list(plt.imread(img_path).shape)
            except Exception:
                if 'h5' not in ext:
                    # Stop here: continuing would reuse the previous image's
                    # shape or fail on an undefined one.
                    print('file type not recognized')
                    sys.exit(1)
                with h5py.File(img_path, 'r') as f:
                    # the first dataset in the file is taken as the image
                    img_shape = list(f[list(f.keys())[0]].shape[:2])
            if len(img_shape) < 3:
                img_shape.append(1)  # append 1 to represent channels
            x_origin = -(img_shape[1] - 1) / 2 * dx
            y_origin = -(img_shape[0] - 1) / 2 * dy
            space_origin.append([x_origin, y_origin, dz / slice_downfactor * (slice_num - 1 - center)])
            sizes.append([img_shape[2], img_shape[1], img_shape[0], 1])
            space_directions.append([[dx, 0., 0.], [0., dy, 0.], [0., 0., dz]])

    for i, fname in enumerate(fnames):
        sample_id = os.path.splitext(fname)[0]
        print(sample_id + '.json')
        with open(os.path.join(subject_dir, sample_id + '.json'), 'w') as f:
            f.write('{{\n'
                    ' \"DataFile\": \"{0}\",\n'
                    ' \"Type\": \"{1}\",\n'
                    ' \"Dimension\": 4,\n'
                    ' \"Sizes\": {2},\n'
                    ' \"Endian\": \"big\",\n'
                    ' \"Space\": \"{3}\",\n'
                    ' \"SpaceDimension\": 3,\n'
                    ' \"SpaceUnits\": [\"um\", \"um\", \"um\"],\n'
                    ' \"SpaceDirections\": ["none", {4}, {5}, {6}],\n'
                    ' \"SpaceOrigin\": {7}\n'
                    '}}'.format(fname, dtype, sizes[i], space, space_directions[i][0], space_directions[i][1],
                                space_directions[i][2],
                                space_origin[i]))
def main(argv=None):
    """Command-line entry point: downsample a dataset, then write its metadata.

    Runs ``downsample_slices``, ``make_samples_tsv`` and ``generate_sidecars``
    in that order; the last two operate on the output directory.

    Parameters
    ----------
    argv: list of str, optional
        Command-line arguments without the program name. Defaults to
        ``sys.argv[1:]``.

    Options
    -------
    -h  print usage and exit
    -i  input (subject) directory, default '.'
    -o  output directory, default '.'
    -d  voxel spacing as three comma-separated numbers, e.g. [14.72,14.72,10]
    -e  image file extension
    -s  slice downsample factor
    -r  in-plane resolution downsample factor
    -m  maximum slice number
    -p  anatomical space label written to the sidecars

    Raises
    ------
    SystemExit
        With status 2 on an unrecognized option, or without a status after
        printing the usage for ``-h``.
    """
    usage = 'histsetup.py -i <inputDir> -o <outputDir> -d <dv> -e <fileExtension> -s <sliceDownFactor> -r <resolutionDownFactor> -m <maxSlice> -p <space>'
    if argv is None:
        argv = sys.argv[1:]

    subject_dir = '.'
    output_dir = '.'
    ext = ''
    slice_down = 1
    res_down = 1
    max_slice = None
    dv = [14.72, 14.72, 10.]
    space = "right-inferior-posterior"

    try:
        # 'p:' must be listed here, otherwise getopt rejects -p outright
        opts, args = getopt.getopt(argv, "hi:o:d:e:s:r:m:p:")
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt == '-i':
            subject_dir = arg
        elif opt == '-o':
            output_dir = arg
        elif opt == '-d':
            # accept the value with or without surrounding brackets
            dv = [float(v) for v in arg.strip('[]() ').split(',')]
        elif opt == '-e':
            ext = arg
        elif opt == '-s':
            slice_down = int(arg)
        elif opt == '-r':
            res_down = int(arg)
        elif opt == '-m':
            max_slice = int(arg)
        elif opt == '-p':
            space = str(arg)

    # voxel spacing of the downsampled dataset
    dv = [dv[0]*res_down, dv[1]*res_down, dv[2]*slice_down]
    print('subject directory: ', subject_dir, '\n',
          'output directory: ', output_dir, '\n',
          'slice downsample factor: ', slice_down, '\n',
          'resolution downsample factor: ', res_down, '\n',
          'maximum slice number: ', max_slice, '\n',
          'dv: ', dv)

    downsample_slices(subject_dir, output_dir, ext, slice_downfactor=slice_down, image_downfactor=res_down)
    if res_down > 1:
        # resized images are always written as PNG by downsample_slices
        ext = 'png'
    make_samples_tsv(output_dir, ext=ext, slice_downfactor=slice_down, max_slice=max_slice)
    generate_sidecars(output_dir, ext=ext, max_slice=max_slice, dv=dv, slice_downfactor=slice_down, space=space)
# Run the command-line interface only when executed as a script, not on import.
if __name__ == '__main__':
    main()