Skip to content

Commit

Permalink
Curated canonical isoforms with appris
Browse files Browse the repository at this point in the history
  • Loading branch information
Jfortin1 committed Sep 22, 2022
1 parent debd2dd commit 9bd8e46
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 2 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ Description: Provides precomputed data for the crisprVerse ecosystem.
(TSSs), are provided for both mouse and human. Genomic coordinates
of repeat elements are also provided. All objects are expressed in
GRCh38 (human) and GRCm38 (mouse) coordinates.
Version: 0.99.22
Version: 0.99.23
Authors@R: c(
person("Jean-Philippe", "Fortin", email = "[email protected]", role = c("aut", "cre")),
person("Luke", "Hoberecht", email = "[email protected]", role = c("aut"))
Expand Down
Binary file modified data/canonicalHuman.rda
Binary file not shown.
Binary file modified data/canonicalMouse.rda
Binary file not shown.
81 changes: 80 additions & 1 deletion inst/generateTxDbData.R
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,88 @@ canonicalMouse <- getCanonicalTranscripts(ids,
organism="mmusculus")



# Restrict a table of canonical transcripts to the current annotation and
# fill in a canonical transcript for every annotated gene that lacks one.
#
# Arguments:
#   canonicalTxs  Table with columns gene_id and tx_id (rows with the same
#                 two columns are appended to it with rbind()).
#   appris        Table with columns gene_id, tx_id, mane_select (used as a
#                 logical row filter), appris_label and appris_number.
#   txObject      List-like object whose "cds" element carries the metadata
#                 columns gene_id, tx_id and gene_symbol (read via mcols())
#                 and supports BiocGenerics::width().
#
# Value:
#   canonicalTxs, minus transcripts absent from the annotation, plus one row
#   per previously missing gene, chosen with the following priority:
#     1. the transcript flagged mane_select in appris;
#     2. the appris "PRINCIPAL" transcript with the lowest appris_number;
#     3. the first remaining appris transcript listed for the gene;
#     4. the transcript with the largest total CDS width.
#
# NOTE(review): appris transcripts are not checked against the annotation
# the way canonicalTxs is -- confirm that appris was built from the same
# release.
cleanCanonical <- function(canonicalTxs,
                           appris,
                           txObject
){
    keepCols <- c("gene_id", "tx_id")

    # Unique (gene, transcript, symbol) combinations found in the annotation.
    cols <- c("gene_id", "tx_id", "gene_symbol")
    tx2Gene <- mcols(txObject[["cds"]])[, cols, drop = FALSE]
    tx2Gene <- as.data.frame(tx2Gene)
    tx2Gene <- tx2Gene[!duplicated(tx2Gene), ]
    rownames(tx2Gene) <- NULL

    # Genes of the annotation that have no canonical transcript in `tab`.
    genesWithout <- function(tab){
        setdiff(tx2Gene$gene_id, tab$gene_id)
    }

    # For each of `genes` present in `tab`, keep the first row of `tab`
    # for that gene. Rows come back in the order of `genes`.
    firstPerGene <- function(tab, genes){
        genes <- intersect(genes, tab$gene_id)
        tab[match(genes, tab$gene_id), keepCols]
    }

    # Only keeping transcripts that exist in current annotation:
    good <- canonicalTxs$tx_id %in% tx2Gene$tx_id
    canonicalTxs <- canonicalTxs[good,]

    # Let's add MANE select transcripts.
    # which() drops NA flags; a bare logical index would turn each NA into
    # a row of NAs that would then be appended to the result.
    missingGenes <- genesWithout(canonicalTxs)
    appris <- appris[appris$gene_id %in% missingGenes,]
    mane <- appris[which(appris$mane_select),]
    stopifnot(!any(duplicated(mane$gene_id)))
    canonicalTxs <- rbind(canonicalTxs, mane[, keepCols])

    # Let's add Appris principal.
    # Sorting by appris_number within gene makes the first row per gene the
    # one with the lowest appris_number.
    missingGenes <- genesWithout(canonicalTxs)
    appris <- appris[appris$gene_id %in% missingGenes,]
    principal <- appris[which(appris$appris_label == "PRINCIPAL"),]
    principal <- principal[order(principal$gene_id,
                                 principal$appris_number),]
    canonicalTxs <- rbind(canonicalTxs,
                          firstPerGene(principal, missingGenes))

    # Let's add Appris alternative:
    # whatever appris entry comes first for each gene that is still missing.
    missingGenes <- genesWithout(canonicalTxs)
    appris <- appris[appris$gene_id %in% missingGenes,]
    canonicalTxs <- rbind(canonicalTxs,
                          firstPerGene(appris, missingGenes))

    # Are there any still missing?
    missingGenes <- genesWithout(canonicalTxs)
    if (length(missingGenes) == 0L){
        return(canonicalTxs)
    }

    # We will select the longest isoform for missing genes:
    tx2Gene <- tx2Gene[tx2Gene$gene_id %in% missingGenes,]
    cds <- txObject[["cds"]]
    cds <- cds[cds$gene_id %in% missingGenes]
    # Total CDS width per transcript.
    cdsByTx <- split(cds, f=cds$tx_id)
    txLen <- vapply(cdsByTx,
                    function(x) sum(BiocGenerics::width(x)),
                    FUN.VALUE=1)
    tx2Gene$len <- txLen[match(tx2Gene$tx_id, names(txLen))]
    # order() is stable, so ties go to the transcript listed first.
    longest <- lapply(split(tx2Gene, f=tx2Gene$gene_id), function(df){
        df <- df[order(-df$len),,drop=FALSE]
        df[1,,drop=FALSE]
    })
    longest <- do.call(rbind, longest)
    canonicalTxs <- rbind(canonicalTxs, longest[, keepCols])
    return(canonicalTxs)
}

# Clean the canonical transcript tables against the current release.
# canonicalHuman / canonicalMouse are taken from the workspace; the load()
# calls for the stored copies are left commented out.

# Human
#load("../data/canonicalHuman.rda")
load("../data/txdb_human.rda")
load("../data/apprisHuman.rda")
canonicalHuman <- cleanCanonical(canonicalTxs=canonicalHuman,
                                 appris=apprisHuman,
                                 txObject=txdb_human)

# Mouse
#load("../data/canonicalMouse.rda")
load("../data/txdb_mouse.rda")
load("../data/apprisMouse.rda")
canonicalMouse <- cleanCanonical(canonicalTxs=canonicalMouse,
                                 appris=apprisMouse,
                                 txObject=txdb_mouse)



# Write both cleaned tables back to the package data directory.
use_data(canonicalHuman, overwrite=TRUE, compress="xz")
use_data(canonicalMouse, overwrite=TRUE, compress="xz")




0 comments on commit 9bd8e46

Please sign in to comment.