Skip to content

Commit

Permalink
Code documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
axdanbol committed Dec 13, 2023
1 parent 93d85ee commit 160d3d8
Show file tree
Hide file tree
Showing 11 changed files with 519 additions and 36 deletions.
74 changes: 68 additions & 6 deletions containers/azimuth/context/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def __init__(self):
super().__init__(OrganLookup)

def do_run(self, matrix: Path, organ: str, options: AzimuthOptions):
"""Annotate data using azimuth."""
data = anndata.read_h5ad(matrix)
reference_data = self.find_reference_data(organ, options["reference_data_dir"])
annotation_level = self.find_annotation_level(
Expand All @@ -42,23 +43,59 @@ def do_run(self, matrix: Path, organ: str, options: AzimuthOptions):

return data, annotation_level

def create_clean_matrix(self, matrix: anndata.AnnData) -> anndata.AnnData:
    """Create a copy of the data stripped of all observation columns.

    Args:
        matrix (anndata.AnnData): Original data

    Returns:
        anndata.AnnData: Cleaned copy of the data
    """
    stripped = matrix.copy()
    # Keep only the observation index; every obs column is dropped.
    stripped.obs = pandas.DataFrame(index=matrix.obs.index)
    return stripped

def copy_annotations(
    self, matrix: anndata.AnnData, annotated_matrix: anndata.AnnData
) -> None:
    """Copy annotation columns from one matrix into another in place.

    Columns whose names collide with existing columns on the target
    receive an "_azimuth" suffix on the incoming copy.

    Args:
        matrix (anndata.AnnData): Matrix to copy to
        annotated_matrix (anndata.AnnData): Matrix to copy from
    """
    joined_obs = matrix.obs.join(annotated_matrix.obs, rsuffix="_azimuth")
    matrix.obs = joined_obs

def run_azimuth_scripts(self, matrix_path: Path, reference_data: Path) -> str:
    """Run the Azimuth annotation R script in a subprocess.

    Args:
        matrix_path (Path): Path to data file
        reference_data (Path): Path to organ reference data directory

    Raises:
        subprocess.CalledProcessError: If the R script exits with a non-zero status

    Returns:
        str: Path to the output data file written by the script
    """
    command = ["Rscript", "/run_azimuth.R", matrix_path, reference_data]
    # check=True surfaces script failures as CalledProcessError
    subprocess.run(command, capture_output=True, check=True, text=True)
    return "./result.h5ad"

def find_reference_data(self, organ: str, dir: Path):
def find_reference_data(self, organ: str, dir: Path) -> Path:
"""Finds the reference data directory for an organ.
Args:
organ (str): Organ name
dir (Path): Directory to search
Raises:
ValueError: If no reference data could be found
Returns:
Path: The data directory
"""

def is_reference_data_candidate(path: Path):
return path.is_dir() and organ.lower() in path.name.lower()

Expand All @@ -71,14 +108,39 @@ def is_reference_data_candidate(path: Path):
# idx.annoy and ref.Rds is always located inside an 'azimuth' subdirectory
return subdir / "azimuth"

def find_annotation_level(self, organ: str, path: Path) -> str:
    """Find the column name which contains the predictions.

    Args:
        organ (str): Organ name
        path (Path): Path to a JSON file mapping organ names to annotation levels

    Raises:
        KeyError: If the file has no entry for the organ

    Returns:
        str: Column name
    """
    # JSON is specified as UTF-8; don't rely on the platform default encoding
    with open(path, encoding="utf-8") as file:
        levels_by_organ = json.load(file)
    return "predicted." + levels_by_organ[organ]

def _find_in_dir(
self, dir: Path, cond: t.Callable[[Path], bool], error_msg: str, warn_msg: str
):
) -> Path:
"""Search a directory for a entry which passes the provided test.
Args:
dir (Path): Directory to search
cond (t.Callable[[Path], bool]): Test used to match sub entries
error_msg (str): Error message used when no entries match
warn_msg (str): Warning message use when multiple entries match
Raises:
ValueError: If there are no matching sub entries
Returns:
Path:
The matching entry.
If multiple entries match the one with the shortest name is returned.
"""
candidates = list(filter(cond, dir.iterdir()))
candidates.sort(key=lambda path: len(path.name))

Expand Down
33 changes: 32 additions & 1 deletion containers/celltypist/context/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,12 @@ def __init__(self, mapping_file: Path):
super().__init__(mapping_file)

def get_builtin_options(self):
    """Get builtin celltypist model names paired with their loaded models."""
    model_names = celltypist.models.get_all_models()
    return map(lambda name: (name, self.from_raw(name)), model_names)

def from_raw(self, id: str):
    """Load the celltypist model identified by `id`."""
    model = celltypist.models.Model.load(id)
    return model


Expand All @@ -31,6 +33,7 @@ def __init__(self):
super().__init__(CelltypistOrganLookup, "predicted_labels")

def do_run(self, matrix: Path, organ: celltypist.Model, options: CelltypistOptions):
"""Annotate data using celltypist."""
data = scanpy.read_h5ad(matrix)
data = self.normalize(data)
data, var_names = self.normalize_var_names(data, options)
Expand All @@ -39,6 +42,17 @@ def do_run(self, matrix: Path, organ: celltypist.Model, options: CelltypistOptio
return data

def normalize(self, data: scanpy.AnnData) -> scanpy.AnnData:
"""Normalizes data according to celltypist requirements.
Celltypist requires data to be log1p normalized with 10,000 counts per cell.
See https://github.com/Teichlab/celltypist for details.
Args:
data (scanpy.AnnData): Original data to be normalized
Returns:
scanpy.AnnData: Normalized data
"""
primary_column = "feature_name"
alternative_primary_column = "gene_symbol"
if primary_column not in data.var.columns:
Expand All @@ -58,6 +72,15 @@ def normalize(self, data: scanpy.AnnData) -> scanpy.AnnData:
def normalize_var_names(
self, data: scanpy.AnnData, options: CelltypistOptions
) -> t.Tuple[scanpy.AnnData, pandas.Index]:
"""Normalizes variable names, replacing ensemble ids with the corresponding gene name.
Args:
data (scanpy.AnnData): Data with potentially non-normalized names
options (CelltypistOptions): Options containing the ensemble id mapping file path
Returns:
t.Tuple[scanpy.AnnData, pandas.Index]: The normalized data along with the original names
"""
lookup = self.load_ensemble_lookup(options)
names = data.var_names

Expand All @@ -68,7 +91,15 @@ def getNewName(name: str):
data.var_names = t.cast(t.Any, names.map(getNewName))
return data, names

def load_ensemble_lookup(self, options: CelltypistOptions):
def load_ensemble_lookup(self, options: CelltypistOptions) -> t.Dict[str, str]:
"""Load a file mapping ensemble id to gene names.
Args:
options (CelltypistOptions): Options with the mapping file path
Returns:
t.Dict[str, str]: Loaded mapping
"""
with open(options["ensemble_lookup"]) as file:
reader = csv.DictReader(file)
lookup: t.Dict[str, str] = {}
Expand Down
77 changes: 70 additions & 7 deletions containers/crosswalking/context/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,28 @@


def filter_crosswalk_table(table: pd.DataFrame, *columns: str) -> pd.DataFrame:
    """Filter the crosswalk table to only include the specified columns.

    Also drops rows with missing values, casts every value to string,
    and removes duplicate rows.

    Args:
        table (pd.DataFrame): Original full crosswalk table
        *columns (str): Names of the columns to keep

    Returns:
        pd.DataFrame: Filtered table
    """
    selected = table[list(columns)]
    non_empty = selected.dropna()
    return non_empty.astype(str).drop_duplicates()


def generate_iri(label: str):
"""generate IRIs for labels not found in crosswalk tables"""
def generate_iri(label: str) -> str:
"""Create a temporary IRI based on a label.
Args:
label (str): Label for the row
Returns:
str: Temporary IRI
"""
suffix = label.lower().strip()
suffix = re.sub(r"\W+", "-", suffix)
suffix = re.sub(r"[^a-z0-9-]+", "", suffix)
Expand All @@ -29,7 +45,21 @@ def crosswalk(
table_clid_column: str,
table_match_column: str,
) -> anndata.AnnData:
"""Gives each cell a CL ID and Match type using crosswalk table"""
"""Crosswalks the data adding CLIDs and match types using a crosswalk table.
Args:
matrix (anndata.AnnData): Data to crosswalk
data_label_column (str): Column used to match against the table
data_clid_column (str): Column to store CLIDs in
data_match_column (str): Column to store match type in
table (pd.DataFrame): Crosswalk table
table_label_column (str): Column used to match against the data
table_clid_column (str): Column storing CLIDs
table_match_column (str): Column storing match type
Returns:
anndata.AnnData: Crosswalked data with CLIDs and match type added
"""
column_map = {
table_clid_column: data_clid_column,
table_match_column: data_match_column,
Expand All @@ -54,16 +84,37 @@ def crosswalk(
return result


def _set_default_clid(obs: pd.DataFrame, clid_column: str, label_column: str) -> None:
    """Add default CLIDs to rows that did not match against the crosswalk table.

    Args:
        obs (pd.DataFrame): Data rows
        clid_column (str): Column to check and update with default CLIDs
        label_column (str): Column used when generating default CLIDs
    """
    # Only generate IRIs for rows that actually need a default; the previous
    # row-wise apply computed one for every row and discarded most of them.
    missing = obs[clid_column].isna()
    obs.loc[missing, clid_column] = obs.loc[missing, label_column].map(generate_iri)


def _set_default_match(obs: pd.DataFrame, column: str):
def _set_default_match(obs: pd.DataFrame, column: str) -> None:
"""Adds default match type to rows that did not match against the crosswalk table.
Args:
obs (pd.DataFrame): Data rows
column (str): Column to check and update with default match type
"""
obs.loc[obs[column].isna(), column] = "skos:exactMatch"


def _get_empty_table(args: argparse.Namespace) -> pd.DataFrame:
"""Creates an empty crosswalk table.
Args:
args (argparse.Namespace): Same arguments as provided to `main`
Returns:
pd.DataFrame: An empty table
"""
return pd.DataFrame(
columns=[
args.crosswalk_table_label_column,
Expand All @@ -74,12 +125,24 @@ def _get_empty_table(args: argparse.Namespace) -> pd.DataFrame:


def main(args: argparse.Namespace):
"""Crosswalks a h5ad file and saves the result to another h5ad file.
Args:
args (argparse.Namespace):
CLI arguments, must contain "matrix",
"annotation_column", "clid_column", "match_column",
"crosswalk_table", "crosswalk_table_label_column",
"crosswalk_table_clid_column", "crosswalk_table_match_column", and
"output_matrix"
"""
matrix = crosswalk(
args.matrix,
args.annotation_column,
args.clid_column,
args.match_column,
args.crosswalk_table if args.crosswalk_table is not None else _get_empty_table(args),
args.crosswalk_table
if args.crosswalk_table is not None
else _get_empty_table(args),
args.crosswalk_table_label_column,
args.crosswalk_table_clid_column,
args.crosswalk_table_match_column,
Expand Down
39 changes: 37 additions & 2 deletions containers/extract-summary/context/main.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import argparse
import json
import typing as t

import anndata
import pandas as pd
Expand All @@ -8,6 +9,15 @@
def get_unique_rows_with_counts(
matrix: anndata.AnnData, clid_column: str
) -> pd.DataFrame:
"""Computes unique CLIDs and the total count for each.
Args:
matrix (anndata.AnnData): Data
clid_column (str): Column with CLIDs
Returns:
pd.DataFrame: A frame with unique CLIDs and counts added
"""
counts = matrix.obs.value_counts(clid_column).reset_index()
counts.columns = [clid_column, "count"]
obs_with_counts = matrix.obs.merge(counts, how="left")
Expand All @@ -20,7 +30,19 @@ def unique_rows_to_summary_rows(
label_column: str,
gene_expr_column: str,
counts_column="count",
):
) -> t.List[dict]:
"""Converts a data frame with unique CLIDs rows into cell summary rows.
Args:
unique (pd.DataFrame): Data with unique CLIDs
clid_column (str): Column with CLIDs
label_column (str): Column with labels
gene_expr_column (str): Column with gene expressions
counts_column (str, optional): Column with the total counts. Defaults to "count".
Returns:
t.List[dict]: A cell summary for each row in the source data
"""
columns = [clid_column, label_column, gene_expr_column, counts_column]
df = unique[columns].rename(
columns={
Expand All @@ -33,11 +55,24 @@ def unique_rows_to_summary_rows(

df["@type"] = "CellSummaryRow"
df["percentage"] = df["count"] / df["count"].sum()
df["gene_expr"] = df["gene_expr"].astype(object).apply(lambda x: [] if pd.isna(x) else json.loads(x))
df["gene_expr"] = (
df["gene_expr"]
.astype(object)
.apply(lambda x: [] if pd.isna(x) else json.loads(x))
)
return df.to_dict("records")


def main(args: argparse.Namespace):
"""Extract and save a cell summary from annotated data.
Args:
args (argparse.Namespace):
CLI arguments, must contain "matrix", "annotation_method",
"cell_id_column", "cell_label_column", "gene_expr_column",
"cell_source, "jsonld_context", "output", and
"annotations_output"
"""
unique_rows = get_unique_rows_with_counts(args.matrix, args.cell_id_column)
summary_rows = unique_rows_to_summary_rows(
unique_rows, args.cell_id_column, args.cell_label_column, args.gene_expr_column
Expand Down
Loading

0 comments on commit 160d3d8

Please sign in to comment.