Skip to content

Commit

Permalink
Code documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
axdanbol committed Dec 13, 2023
1 parent 93d85ee commit 160d3d8
Show file tree
Hide file tree
Showing 11 changed files with 519 additions and 36 deletions.
74 changes: 68 additions & 6 deletions containers/azimuth/context/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def __init__(self):
super().__init__(OrganLookup)

def do_run(self, matrix: Path, organ: str, options: AzimuthOptions):
"""Annotate data using azimuth."""
data = anndata.read_h5ad(matrix)
reference_data = self.find_reference_data(organ, options["reference_data_dir"])
annotation_level = self.find_annotation_level(
Expand All @@ -42,23 +43,59 @@ def do_run(self, matrix: Path, organ: str, options: AzimuthOptions):

return data, annotation_level

def create_clean_matrix(self, matrix: anndata.AnnData) -> anndata.AnnData:
    """Create a copy of the data stripped of all observation columns.

    Args:
        matrix (anndata.AnnData): Original data

    Returns:
        anndata.AnnData: Cleaned copy of the data
    """
    stripped = matrix.copy()
    # Keep only the observation index; every obs column is dropped.
    stripped.obs = pandas.DataFrame(index=matrix.obs.index)
    return stripped

def copy_annotations(
    self, matrix: anndata.AnnData, annotated_matrix: anndata.AnnData
) -> None:
    """Copy annotation columns from one matrix into another in place.

    Columns whose names collide with existing columns on the target
    receive an "_azimuth" suffix on the incoming copy.

    Args:
        matrix (anndata.AnnData): Matrix to copy to
        annotated_matrix (anndata.AnnData): Matrix to copy from
    """
    joined_obs = matrix.obs.join(annotated_matrix.obs, rsuffix="_azimuth")
    matrix.obs = joined_obs

def run_azimuth_scripts(self, matrix_path: Path, reference_data: Path) -> str:
    """Run the Azimuth annotation R script in a subprocess.

    Args:
        matrix_path (Path): Path to data file
        reference_data (Path): Path to organ reference data directory

    Raises:
        subprocess.CalledProcessError: If the R script exits with a non-zero status

    Returns:
        str: Path to the output data file written by the script
    """
    command = ["Rscript", "/run_azimuth.R", matrix_path, reference_data]
    # check=True surfaces script failures as CalledProcessError
    subprocess.run(command, capture_output=True, check=True, text=True)
    return "./result.h5ad"

def find_reference_data(self, organ: str, dir: Path):
def find_reference_data(self, organ: str, dir: Path) -> Path:
"""Finds the reference data directory for an organ.
Args:
organ (str): Organ name
dir (Path): Directory to search
Raises:
ValueError: If no reference data could be found
Returns:
Path: The data directory
"""

def is_reference_data_candidate(path: Path):
return path.is_dir() and organ.lower() in path.name.lower()

Expand All @@ -71,14 +108,39 @@ def is_reference_data_candidate(path: Path):
# idx.annoy and ref.Rds is always located inside an 'azimuth' subdirectory
return subdir / "azimuth"

def find_annotation_level(self, organ: str, path: Path) -> str:
    """Find the column name which contains the predictions.

    Args:
        organ (str): Organ name
        path (Path): Path to a JSON file mapping organ names to annotation levels

    Raises:
        KeyError: If the file has no entry for the organ

    Returns:
        str: Column name
    """
    # JSON is specified as UTF-8; don't rely on the platform default encoding
    with open(path, encoding="utf-8") as file:
        levels_by_organ = json.load(file)
    return "predicted." + levels_by_organ[organ]

def _find_in_dir(
self, dir: Path, cond: t.Callable[[Path], bool], error_msg: str, warn_msg: str
):
) -> Path:
"""Search a directory for a entry which passes the provided test.
Args:
dir (Path): Directory to search
cond (t.Callable[[Path], bool]): Test used to match sub entries
error_msg (str): Error message used when no entries match
warn_msg (str): Warning message use when multiple entries match
Raises:
ValueError: If there are no matching sub entries
Returns:
Path:
The matching entry.
If multiple entries match the one with the shortest name is returned.
"""
candidates = list(filter(cond, dir.iterdir()))
candidates.sort(key=lambda path: len(path.name))

Expand Down
33 changes: 32 additions & 1 deletion containers/celltypist/context/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,12 @@ def __init__(self, mapping_file: Path):
super().__init__(mapping_file)

def get_builtin_options(self):
    """Get builtin celltypist model names paired with their loaded models."""
    model_names = celltypist.models.get_all_models()
    return map(lambda name: (name, self.from_raw(name)), model_names)

def from_raw(self, id: str):
    """Load the celltypist model identified by `id`."""
    model = celltypist.models.Model.load(id)
    return model


Expand All @@ -31,6 +33,7 @@ def __init__(self):
super().__init__(CelltypistOrganLookup, "predicted_labels")

def do_run(self, matrix: Path, organ: celltypist.Model, options: CelltypistOptions):
"""Annotate data using celltypist."""
data = scanpy.read_h5ad(matrix)
data = self.normalize(data)
data, var_names = self.normalize_var_names(data, options)
Expand All @@ -39,6 +42,17 @@ def do_run(self, matrix: Path, organ: celltypist.Model, options: CelltypistOptio
return data

def normalize(self, data: scanpy.AnnData) -> scanpy.AnnData:
"""Normalizes data according to celltypist requirements.
Celltypist requires data to be log1p normalized with 10,000 counts per cell.
See https://github.com/Teichlab/celltypist for details.
Args:
data (scanpy.AnnData): Original data to be normalized
Returns:
scanpy.AnnData: Normalized data
"""
primary_column = "feature_name"
alternative_primary_column = "gene_symbol"
if primary_column not in data.var.columns:
Expand All @@ -58,6 +72,15 @@ def normalize(self, data: scanpy.AnnData) -> scanpy.AnnData:
def normalize_var_names(
self, data: scanpy.AnnData, options: CelltypistOptions
) -> t.Tuple[scanpy.AnnData, pandas.Index]:
"""Normalizes variable names, replacing ensemble ids with the corresponding gene name.
Args:
data (scanpy.AnnData): Data with potentially non-normalized names
options (CelltypistOptions): Options containing the ensemble id mapping file path
Returns:
t.Tuple[scanpy.AnnData, pandas.Index]: The normalized data along with the original names
"""
lookup = self.load_ensemble_lookup(options)
names = data.var_names

Expand All @@ -68,7 +91,15 @@ def getNewName(name: str):
data.var_names = t.cast(t.Any, names.map(getNewName))
return data, names

def load_ensemble_lookup(self, options: CelltypistOptions):
def load_ensemble_lookup(self, options: CelltypistOptions) -> t.Dict[str, str]:
"""Load a file mapping ensemble id to gene names.
Args:
options (CelltypistOptions): Options with the mapping file path
Returns:
t.Dict[str, str]: Loaded mapping
"""
with open(options["ensemble_lookup"]) as file:
reader = csv.DictReader(file)
lookup: t.Dict[str, str] = {}
Expand Down
77 changes: 70 additions & 7 deletions containers/crosswalking/context/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,28 @@


def filter_crosswalk_table(table: pd.DataFrame, *columns: str) -> pd.DataFrame:
    """Filter the crosswalk table to only include the specified columns.

    Also drops rows with missing values, casts every value to string,
    and removes duplicate rows.

    Args:
        table (pd.DataFrame): Original full crosswalk table
        *columns (str): Names of the columns to keep

    Returns:
        pd.DataFrame: Filtered table
    """
    selected = table[list(columns)]
    non_empty = selected.dropna()
    return non_empty.astype(str).drop_duplicates()


def generate_iri(label: str):
"""generate IRIs for labels not found in crosswalk tables"""
def generate_iri(label: str) -> str:
"""Create a temporary IRI based on a label.
Args:
label (str): Label for the row
Returns:
str: Temporary IRI
"""
suffix = label.lower().strip()
suffix = re.sub(r"\W+", "-", suffix)
suffix = re.sub(r"[^a-z0-9-]+", "", suffix)
Expand All @@ -29,7 +45,21 @@ def crosswalk(
table_clid_column: str,
table_match_column: str,
) -> anndata.AnnData:
"""Gives each cell a CL ID and Match type using crosswalk table"""
"""Crosswalks the data adding CLIDs and match types using a crosswalk table.
Args:
matrix (anndata.AnnData): Data to crosswalk
data_label_column (str): Column used to match against the table
data_clid_column (str): Column to store CLIDs in
data_match_column (str): Column to store match type in
table (pd.DataFrame): Crosswalk table
table_label_column (str): Column used to match against the data
table_clid_column (str): Column storing CLIDs
table_match_column (str): Column storing match type
Returns:
anndata.AnnData: Crosswalked data with CLIDs and match type added
"""
column_map = {
table_clid_column: data_clid_column,
table_match_column: data_match_column,
Expand All @@ -54,16 +84,37 @@ def crosswalk(
return result


def _set_default_clid(obs: pd.DataFrame, clid_column: str, label_column: str) -> None:
    """Add default CLIDs to rows that did not match against the crosswalk table.

    Args:
        obs (pd.DataFrame): Data rows
        clid_column (str): Column to check and update with default CLIDs
        label_column (str): Column used when generating default CLIDs
    """
    # Only generate IRIs for rows that actually need a default; the previous
    # row-wise apply computed one for every row and discarded most of them.
    missing = obs[clid_column].isna()
    obs.loc[missing, clid_column] = obs.loc[missing, label_column].map(generate_iri)


def _set_default_match(obs: pd.DataFrame, column: str):
def _set_default_match(obs: pd.DataFrame, column: str) -> None:
"""Adds default match type to rows that did not match against the crosswalk table.
Args:
obs (pd.DataFrame): Data rows
column (str): Column to check and update with default match type
"""
obs.loc[obs[column].isna(), column] = "skos:exactMatch"


def _get_empty_table(args: argparse.Namespace) -> pd.DataFrame:
"""Creates an empty crosswalk table.
Args:
args (argparse.Namespace): Same arguments as provided to `main`
Returns:
pd.DataFrame: An empty table
"""
return pd.DataFrame(
columns=[
args.crosswalk_table_label_column,
Expand All @@ -74,12 +125,24 @@ def _get_empty_table(args: argparse.Namespace) -> pd.DataFrame:


def main(args: argparse.Namespace):
"""Crosswalks a h5ad file and saves the result to another h5ad file.
Args:
args (argparse.Namespace):
CLI arguments, must contain "matrix",
"annotation_column", "clid_column", "match_column",
"crosswalk_table", "crosswalk_table_label_column",
"crosswalk_table_clid_column", "crosswalk_table_match_column", and
"output_matrix"
"""
matrix = crosswalk(
args.matrix,
args.annotation_column,
args.clid_column,
args.match_column,
args.crosswalk_table if args.crosswalk_table is not None else _get_empty_table(args),
args.crosswalk_table
if args.crosswalk_table is not None
else _get_empty_table(args),
args.crosswalk_table_label_column,
args.crosswalk_table_clid_column,
args.crosswalk_table_match_column,
Expand Down
39 changes: 37 additions & 2 deletions containers/extract-summary/context/main.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import argparse
import json
import typing as t

import anndata
import pandas as pd
Expand All @@ -8,6 +9,15 @@
def get_unique_rows_with_counts(
matrix: anndata.AnnData, clid_column: str
) -> pd.DataFrame:
"""Computes unique CLIDs and the total count for each.
Args:
matrix (anndata.AnnData): Data
clid_column (str): Column with CLIDs
Returns:
pd.DataFrame: A frame with unique CLIDs and counts added
"""
counts = matrix.obs.value_counts(clid_column).reset_index()
counts.columns = [clid_column, "count"]
obs_with_counts = matrix.obs.merge(counts, how="left")
Expand All @@ -20,7 +30,19 @@ def unique_rows_to_summary_rows(
label_column: str,
gene_expr_column: str,
counts_column="count",
):
) -> t.List[dict]:
"""Converts a data frame with unique CLIDs rows into cell summary rows.
Args:
unique (pd.DataFrame): Data with unique CLIDs
clid_column (str): Column with CLIDs
label_column (str): Column with labels
gene_expr_column (str): Column with gene expressions
counts_column (str, optional): Column with the total counts. Defaults to "count".
Returns:
t.List[dict]: A cell summary for each row in the source data
"""
columns = [clid_column, label_column, gene_expr_column, counts_column]
df = unique[columns].rename(
columns={
Expand All @@ -33,11 +55,24 @@ def unique_rows_to_summary_rows(

df["@type"] = "CellSummaryRow"
df["percentage"] = df["count"] / df["count"].sum()
df["gene_expr"] = df["gene_expr"].astype(object).apply(lambda x: [] if pd.isna(x) else json.loads(x))
df["gene_expr"] = (
df["gene_expr"]
.astype(object)
.apply(lambda x: [] if pd.isna(x) else json.loads(x))
)
return df.to_dict("records")


def main(args: argparse.Namespace):
"""Extract and save a cell summary from annotated data.
Args:
args (argparse.Namespace):
CLI arguments, must contain "matrix", "annotation_method",
"cell_id_column", "cell_label_column", "gene_expr_column",
"cell_source, "jsonld_context", "output", and
"annotations_output"
"""
unique_rows = get_unique_rows_with_counts(args.matrix, args.cell_id_column)
summary_rows = unique_rows_to_summary_rows(
unique_rows, args.cell_id_column, args.cell_label_column, args.gene_expr_column
Expand Down
Loading

0 comments on commit 160d3d8

Please sign in to comment.