diff --git a/src/sparcstools/utils/dataset_manipulation.py b/src/sparcstools/utils/dataset_manipulation.py
index 6739005..0cdecc7 100644
--- a/src/sparcstools/utils/dataset_manipulation.py
+++ b/src/sparcstools/utils/dataset_manipulation.py
@@ -6,6 +6,7 @@ from rasterio.features import rasterize
 import numpy.ma as ma
 from tqdm.auto import tqdm
+import shutil
 
 def get_indexes(project_location, cell_ids, return_annotation = False):
@@ -20,16 +21,22 @@ def get_indexes(project_location, cell_ids, return_annotation = False):
     index_locs = []
     for cell_id in tqdm(cell_ids, desc = "getting indexes"):
-        index_locs.append(lookup[cell_id])
+        try:
+            index_locs.append(lookup[cell_id])
+        except KeyError:
+            index_locs.append(np.nan)
+            print(f"cell_id {cell_id} not found in dataset. Skipping...")
+            continue
 
     if return_annotation:
         annotation = pd.DataFrame({"index_hdf5": index_locs, "cell_id":cell_ids})
         annotation = annotation.sort_values("index_hdf5")
+        annotation = annotation.dropna() #remove nan values
         return(np.array(annotation.index_hdf5.tolist()), np.array(annotation.cell_id.tolist()))
     else:
         return(np.sort(index_locs))
 
-def save_cells_to_new_hdf5(project_location, name, cell_ids, annotation = "selected_cells", append = False):
+def save_cells_to_new_hdf5(project_location, name, cell_ids, annotation = "selected_cells", append = False, temp_dir = "/tmp"):
 
     #get output directory
     outdir = f"{project_location}/extraction/filtered_data/{name}/"
@@ -49,9 +56,21 @@ def save_cells_to_new_hdf5(project_location, name, cell_ids, annotation = "selec
 
     #get cell images we want to write to new location
     print("getting cell images for selected cells...")
+
     with h5py.File(f"{project_location}/extraction/data/single_cells.h5", "r") as hf_in:
-        cell_images = hf_in.get("single_cell_data")[indexes]
-
+
+        #generate container for single_cell_data
+        from alphabase.io import tempmmap
+        TEMP_DIR_NAME = tempmmap.redefine_temp_location(temp_dir)
+        _, c, x, y = hf_in["single_cell_data"].shape
+        cell_images = tempmmap.array(shape = (len(indexes), c, x, y), dtype = np.float16)
+
+        #actually add the single_cell_data to the container
+        i = 0
+        for index in tqdm(indexes):
+            cell_images[i] = hf_in.get("single_cell_data")[int(index)]
+            i += 1
+
     #delete file if append is False and it already exists so that we generate a new one
     if not append:
         if os.path.isfile(outfile):
@@ -88,7 +107,6 @@ def save_cells_to_new_hdf5(project_location, name, cell_ids, annotation = "selec
         #create index file
         hf_out.create_dataset('single_cell_index', (n_cells, 2), maxshape = (None, 2), dtype="uint64")
 
-
         #create dataset
         hf_out.create_dataset('single_cell_data', (n_cells, n_channels, x, y),
@@ -105,6 +123,10 @@ def save_cells_to_new_hdf5(project_location, name, cell_ids, annotation = "selec
         hf_out.get("annotation")[:] = annotation_df
     print("results saved.")
 
+    #cleanup temp directories
+    del cell_images
+    shutil.rmtree(TEMP_DIR_NAME, ignore_errors=True)
+
 def _read_napari_csv(path):
     # read csv table
     shapes = pd.read_csv(path, sep = ",")
@@ -117,8 +139,15 @@ def _read_napari_csv(path):
     for shape_id in shape_ids:
         _shapes = shapes.loc[shapes.index_shape == shape_id]
-        x = _shapes["axis-0"].tolist()
-        y = _shapes["axis-1"].tolist()
+        x = _shapes["axis-1"].tolist()
+        y = _shapes["axis-0"].tolist()
+
+        if len(x) < 3:
+            print("shape with less than 3 points found. Skipping...")
+            continue
+        if len(y) < 3:
+            print("shape with less than 3 points found. Skipping...")
+            continue
 
         polygon = Polygon(zip(x, y))
         polygons.append(polygon)
@@ -130,7 +159,7 @@ def _generate_mask_polygon(poly, outshape):
     img = rasterize(poly, out_shape = (x, y))
     return(img.astype("bool"))
 
-def extract_single_cells_napari_area(napari_path, project_location):
+def extract_single_cells_napari_area(napari_path, project_location, temp_dir = "/tmp"):
 
     #get name from naparipath
     name = os.path.basename(napari_path).split(".")[0]
@@ -171,4 +200,4 @@ def extract_single_cells_napari_area(napari_path, project_location):
 
     cell_ids = list(cell_ids) #need list type for lookup of indexes
     print(f"found {len(cell_ids)} unique cell_ids in selected area. Will now export to new HDF5 single cell dataset.")
-    save_cells_to_new_hdf5(project_location, cell_ids = cell_ids, name = name, annotation = name, append = False)
+    save_cells_to_new_hdf5(project_location, cell_ids = cell_ids, name = name, annotation = name, append = False, temp_dir=temp_dir)