Curated canonical isoforms with appris

crisprVerse · Sep 22, 2022 · 9bd8e46 · 9bd8e46
1 parent debd2dd
commit 9bd8e46
Show file tree

Hide file tree

Showing 4 changed files with 81 additions and 2 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -5,7 +5,7 @@ Description: Provides precomputed data for the crisprVerse ecosystem.
  (TSSs), are provided for both mouse and human. Genomic coordinates
  of repeat elements are also provided. All objects are expressed in 
  GRCh38 (human) and GRCm38 (mouse) coordinates. 
-Version: 0.99.22
+Version: 0.99.23
 Authors@R: c(
  person("Jean-Philippe", "Fortin", email = "[email protected]", role = c("aut", "cre")),
  person("Luke", "Hoberecht", email = "[email protected]", role = c("aut"))

diff --git a/data/canonicalHuman.rda b/data/canonicalHuman.rda
diff --git a/data/canonicalMouse.rda b/data/canonicalMouse.rda
diff --git a/inst/generateTxDbData.R b/inst/generateTxDbData.R
@@ -71,9 +71,88 @@ canonicalMouse <- getCanonicalTranscripts(ids,
  organism="mmusculus")
 
 
+
+# Complete a table of canonical transcripts against the current annotation.
+#
+# Entries of `canonicalTxs` whose `tx_id` is absent from the CDS annotation
+# in `txObject` are dropped. Genes of the annotation that then have no
+# entry are filled in, one transcript per gene, trying in this order:
+#   1. rows of `appris` flagged by `mane_select`;
+#   2. rows of `appris` labelled "PRINCIPAL" (first row per gene after
+#      sorting by `appris_number`);
+#   3. any remaining row of `appris` (first row per gene);
+#   4. the transcript with the largest summed CDS width.
+#
+# Arguments:
+#   canonicalTxs  table with `gene_id` and `tx_id` columns. It is
+#                 rbind()-ed with two-column tables below, so it is
+#                 presumably limited to those two columns (confirm).
+#   appris        table with `gene_id`, `tx_id`, `mane_select`,
+#                 `appris_label` and `appris_number` columns.
+#   txObject      list-like object whose "cds" element has `gene_id`,
+#                 `tx_id` and `gene_symbol` metadata columns.
+#
+# Returns the filtered `canonicalTxs` with the added rows appended.
+cleanCanonical <- function(canonicalTxs,
+                           appris,
+                           txObject
+){
+    cols <- c("gene_id", "tx_id", "gene_symbol")
+    tx2Gene <- mcols(txObject[["cds"]])[, cols, drop = FALSE]
+    tx2Gene <- as.data.frame(tx2Gene)
+    tx2Gene <- tx2Gene[!duplicated(tx2Gene), ]
+    rownames(tx2Gene) <- NULL
+
+    # Only keeping transcripts that exist in current annotation:
+    good <- canonicalTxs$tx_id %in% tx2Gene$tx_id
+    canonicalTxs <- canonicalTxs[good, ]
+
+    # NOTE(review): the APPRIS transcripts added below are not checked
+    # against tx2Gene$tx_id, unlike the incoming canonical transcripts.
+    # Confirm that `appris` is built from the same annotation release.
+
+    # Let's add MANE select transcripts:
+    missing <- setdiff(tx2Gene$gene_id, canonicalTxs$gene_id)
+    appris <- appris[appris$gene_id %in% missing, ]
+    # `%in% TRUE` treats an NA flag as "not selected"; plain logical
+    # indexing would turn every NA into an all-NA row in the result.
+    mane <- appris[appris$mane_select %in% TRUE, ]
+    stopifnot(!any(duplicated(mane$gene_id)))
+    mane <- mane[, c("gene_id", "tx_id")]
+    canonicalTxs <- rbind(canonicalTxs, mane)
+
+    # Let's add Appris principal:
+    missing <- setdiff(tx2Gene$gene_id, canonicalTxs$gene_id)
+    appris <- appris[appris$gene_id %in% missing, ]
+    # Same NA-safety as for the MANE flag.
+    principal <- appris[appris$appris_label %in% "PRINCIPAL", ]
+    principal <- principal[order(principal$gene_id,
+                                 principal$appris_number), ]
+    missing <- intersect(missing, principal$gene_id)
+    # match() returns the first hit, i.e. the lowest appris_number per gene.
+    wh <- match(missing, principal$gene_id)
+    principal <- principal[wh, ]
+    principal <- principal[, c("gene_id", "tx_id")]
+    canonicalTxs <- rbind(canonicalTxs, principal)
+
+    # Let's add Appris alternative:
+    missing <- setdiff(tx2Gene$gene_id, canonicalTxs$gene_id)
+    appris <- appris[appris$gene_id %in% missing, ]
+    missing <- intersect(missing, appris$gene_id)
+    # First remaining APPRIS row per gene, in the table's current order.
+    wh <- match(missing, appris$gene_id)
+    appris <- appris[wh, ]
+    appris <- appris[, c("gene_id", "tx_id")]
+    canonicalTxs <- rbind(canonicalTxs, appris)
+
+    # Are there any still missing?
+    # We will select the longest isoform for missing genes:
+    missing <- setdiff(tx2Gene$gene_id, canonicalTxs$gene_id)
+    # Guard: with nothing left to fill, do.call(rbind, list()) is NULL
+    # and subsetting its columns would raise an error.
+    if (length(missing) > 0) {
+        tx2Gene <- tx2Gene[tx2Gene$gene_id %in% missing, ]
+        cds <- txObject[["cds"]]
+        cds <- cds[cds$gene_id %in% missing]
+        dfs <- split(cds, f=cds$tx_id)
+        # Summed CDS width per transcript.
+        ns <- vapply(dfs,
+                     function(x) sum(BiocGenerics::width(x)),
+                     FUN.VALUE=1)
+        wh <- match(tx2Gene$tx_id, names(ns))
+        tx2Gene$len <- ns[wh]
+        dfs <- split(tx2Gene, f=tx2Gene$gene_id)
+        dfs <- lapply(dfs, function(df){
+            df <- df[order(-df$len), , drop=FALSE]
+            df[1, , drop=FALSE]
+        })
+        df <- do.call(rbind, dfs)
+        df <- df[, c("gene_id", "tx_id")]
+        canonicalTxs <- rbind(canonicalTxs, df)
+    }
+    return(canonicalTxs)
+}
+
+# Refresh the canonical transcript tables for the current release.
+# canonicalHuman and canonicalMouse are expected to be in the workspace
+# already at this point; the commented-out load() calls are kept for
+# reference.
+
+# Human:
+#load("../data/canonicalHuman.rda")
+load("../data/txdb_human.rda")
+load("../data/apprisHuman.rda")
+canonicalHuman <- cleanCanonical(canonicalTxs=canonicalHuman,
+                                 appris=apprisHuman,
+                                 txObject=txdb_human)
+
+# Mouse:
+#load("../data/canonicalMouse.rda")
+load("../data/txdb_mouse.rda")
+load("../data/apprisMouse.rda")
+canonicalMouse <- cleanCanonical(canonicalTxs=canonicalMouse,
+                                 appris=apprisMouse,
+                                 txObject=txdb_mouse)
+
+
+
 use_data(canonicalHuman, compress="xz", overwrite=TRUE)
 use_data(canonicalMouse, compress="xz", overwrite=TRUE)
 
 
 
-