-
Notifications
You must be signed in to change notification settings - Fork 9
/
vgnc_benchmark.py
executable file
·156 lines (128 loc) · 6.75 KB
/
vgnc_benchmark.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/usr/bin/env python3
import collections
import itertools
import json
import logging
import math
import os
import sqlite3
from JSON_templates import write_assessment_dataset
from helpers import auto_open
logger = logging.getLogger("VGNC-Benchmark")
Protein = collections.namedtuple("Protein", ["Acc", "Species", "VGNC_ID"])
def compute_vgnc_benchmark(vgnc_orthologs, db_path, raw_out):
def get_prot_data_for(proteins):
vgnc_fam = {}
for (p1, p2), fam in vgnc_orthologs.items():
vgnc_fam[p1] = fam
vgnc_fam[p2] = fam
cur = con.cursor()
placeholder = ",".join('?' * len(proteins))
query = f"SELECT * FROM proteomes WHERE prot_nr IN ({placeholder})"
cur.execute(query, proteins)
data = {}
for row in cur.fetchall():
data[row[0]] = Protein(row[1], row[2], vgnc_fam[row[0]])
return data
def get_orthologs_among_subset_of_proteins(proteins):
query = f"SELECT * FROM orthologs WHERE prot_nr1 == ? AND prot_nr1 < prot_nr2"
prot_set = set(proteins)
def get_orthologs_for_query(query_protein):
cur = con.cursor()
cur.execute(query, (query_protein,))
return [rel for rel in cur.fetchall() if rel[1] in prot_set]
for p in prot_set:
yield from get_orthologs_for_query(p)
def write_raw_rels(out, rels, typ):
for en1, en2 in rels:
out.write(
f"{protein_infos[en1].Acc}\t{protein_infos[en2].Acc}\t{typ}\t"
f"{protein_infos[en1].VGNC_ID}\t{protein_infos[en2].VGNC_ID}\t"
f"{protein_infos[en1].Species}\t{protein_infos[en2].Species}\n")
con = sqlite3.connect(db_path)
nr_true = len(vgnc_orthologs)
missing_true_orthologs = set(vgnc_orthologs.keys())
logger.info(f"VGNC asserts {nr_true} orthologous relations")
vgnc_genes = list(set(itertools.chain.from_iterable(vgnc_orthologs.keys())))
protein_infos = get_prot_data_for(vgnc_genes)
all_predicted_orthologs_among_vgnc_genes = set(get_orthologs_among_subset_of_proteins(vgnc_genes))
logger.info(f"Method predicted {len(all_predicted_orthologs_among_vgnc_genes)} among the set of "
f"{len(vgnc_genes)} genes in the VGNC dataset")
missing_true_orthologs -= set((x[0], x[1]) for x in all_predicted_orthologs_among_vgnc_genes)
logger.info(f"Method didn't predict {len(missing_true_orthologs)} asserted orthologs")
true_positives = set(vgnc_orthologs.keys()) - missing_true_orthologs
tp = len(vgnc_orthologs) - len(missing_true_orthologs)
tpr = tp / nr_true
vgnc_per_species = collections.defaultdict(set)
for info in protein_infos.values():
vgnc_per_species[info.Species].add(info.VGNC_ID)
false_positives = []
for p1, p2 in all_predicted_orthologs_among_vgnc_genes:
if protein_infos[p1].VGNC_ID != protein_infos[p2].VGNC_ID:
id1_in_sp2 = protein_infos[p1].VGNC_ID in vgnc_per_species[protein_infos[p2].Species]
id2_in_sp1 = protein_infos[p2].VGNC_ID in vgnc_per_species[protein_infos[p1].Species]
is_fp = id1_in_sp2 and id2_in_sp1
logger.debug(
f"is used as fp: {is_fp:1} -- "
f"{protein_infos[p1].VGNC_ID} ({protein_infos[p1].Acc}) vs {protein_infos[p2].VGNC_ID} ({protein_infos[p2].Acc}): "
f"{protein_infos[p1].VGNC_ID} in {protein_infos[p2].Species}: {id1_in_sp2}; "
f"{protein_infos[p2].VGNC_ID} in {protein_infos[p1].Species}: {id2_in_sp1}")
if is_fp:
false_positives.append((p1, p2))
nr_pos = tp + len(false_positives)
ppv = tp / nr_pos
write_raw_rels(raw_out, true_positives, "TP")
write_raw_rels(raw_out, false_positives, "FP")
write_raw_rels(raw_out, missing_true_orthologs, "FN")
logger.info(f"TPR: {tpr}; nr_true: {nr_true}")
logger.info(f"PPV: {ppv}; nr_pos: {nr_pos}")
metrics = [{"name": "TPR", "value": tpr, "stderr": 1.96 * math.sqrt(tpr * (1 - tpr) / nr_true)},
{"name": "PPV", "value": ppv, "stderr": 1.96 * math.sqrt(ppv * (1 - ppv) / nr_pos)},
]
return metrics
def get_vgnc_orthologs(vgnc_orthologs_fname):
orthologs = {}
with auto_open(vgnc_orthologs_fname, 'rt') as fh:
for line in fh:
a, b, fam = line.strip().split("\t")
a = int(a)
b = int(b)
orthologs[(min(a, b), max(a, b))] = fam
return orthologs
def write_assessment_json_stub(fn, community, participant, result):
challenge = "VGNC"
stubs = []
for metric in result:
id_ = "{}:{}_{}_{}_A".format(community, challenge, metric['name'],
participant.replace(' ', '-').replace('_', '-'))
stubs.append(write_assessment_dataset(id_, community, challenge, participant, metric['name'], metric['value'],
metric.get('stderr', 0)))
with auto_open(fn, 'wt') as fout:
json.dump(stubs, fout, sort_keys=True, indent=4, separators=(',', ': '))
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="VGNC asserted orthologs benchmark")
parser.add_argument('--assessment-out', required=True, help="Path where assessment json file will be stored")
parser.add_argument('--outdir', required=True, help="Folder to store the raw output file in")
parser.add_argument('--db', required=True, help="Path to sqlite database with pairwise predictions")
parser.add_argument('--com', required=True, help="community id")
parser.add_argument('--vgnc-orthologs', required=True, help="Path to text file with VGNC asserted orthologs")
parser.add_argument('--participant', required=True, help="Name of participant method")
parser.add_argument('--log', help="Path to log file. Defaults to stderr")
parser.add_argument('-d', '--debug', action="store_true", help="Set logging to debug level")
conf = parser.parse_args()
log_conf = {'level': logging.INFO, 'format': "%(asctime)-15s %(levelname)-7s: %(message)s"}
if conf.log is not None:
log_conf['filename'] = conf.log
if conf.debug:
log_conf['level'] = logging.DEBUG
logging.basicConfig(**log_conf)
logger.info("running vgnc_benchmark with following arguments: {}".format(conf))
os.makedirs(conf.outdir, exist_ok=True)
outfn_path = os.path.join(conf.outdir,
"VGNC_{}_raw.txt.gz".format(conf.participant.replace(' ', '-').replace('_', '-'))
)
vgnc_orthologs = get_vgnc_orthologs(conf.vgnc_orthologs)
with auto_open(outfn_path, 'wt') as raw_out_fh:
res = compute_vgnc_benchmark(vgnc_orthologs, conf.db, raw_out_fh)
write_assessment_json_stub(conf.assessment_out, conf.com, conf.participant, res)