-
Notifications
You must be signed in to change notification settings - Fork 0
/
6_count_kinase_type.py
executable file
·65 lines (51 loc) · 1.87 KB
/
6_count_kinase_type.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env python3
##########################################################################
#
# Peter Man-Un Ung @ MSSM
#
# v1.0 17.10.07 count how many kinase in each conformational state
#
# Read in the classification results produced by the R machine-learning step
# and count how many unique kinases (by uniprot_id) fall into each conformational state
#
# output format (tab-separated): gene_name, species, unip_id, conf_num, conf, pdb_num, pdb_list
#
##########################################################################
import pandas
import sys, os, re
# Usage text shown when the script is called with the wrong number of
# arguments; '{0}' is filled in with the script name.
msg = '''
  Usage: {0} <classification_result.csv>

  Input must contain the columns: gene, species, uni_id, group, pdb_id
  Output (tab-separated, to stdout):
    gene_name  species  unip_id  conf_num  conf  pdb_num  pdb_list
'''

# String forms that a missing 'gene' cell takes after str() conversion.
MISSING_GENE = ('nan', 'NaN', 'NAN')


def clean_gene(raw):
    """Return the gene name held in a raw 'gene' cell as a string.

    When the cell contains several whitespace-separated tokens, commas are
    removed and only the first token is kept; a single-token cell is
    returned unchanged. A cell that is left empty once the commas are
    removed is returned as-is instead of raising IndexError.
    """
    text = str(raw)
    if len(text.split()) > 1:
        tokens = text.replace(',', '').split()
        if tokens:
            return tokens[0]
    return text


def is_missing_gene(gene):
    """Tell whether *gene* is the string form of a missing value.

    The comparison is exact, so a real gene name that merely contains
    'nan'/'NAN' as a substring is not mistaken for a missing value.
    """
    return gene in MISSING_GENE


def collect_results(rows):
    """Group PDB entries by UniProt ID and conformational state.

    rows: iterable of mappings with the keys 'gene', 'species', 'uni_id',
          'group' and 'pdb_id' (e.g. the rows of the input dataframe).

    Returns a dict shaped as
        { uni_id: { group: [ [pdb_id, ...], gene, species ] } }
    with both levels kept in first-seen order.
    """
    result = {}
    for row in rows:
        gene = clean_gene(row['gene'])
        species = str(row['species']).replace(' ', '_')
        unip, conf, pdb = row['uni_id'], row['group'], row['pdb_id']

        groups = result.setdefault(unip, {})

        # A row without a gene name inherits the one already recorded for
        # this UniProt ID, so a newly seen state is not left as 'nan'.
        if groups and is_missing_gene(gene):
            gene = next(iter(groups.values()))[1]

        if conf in groups:
            groups[conf][0].append(pdb)
        else:
            groups[conf] = [[pdb], gene, species]

        # A usable gene name is propagated to every state of this UniProt
        # ID, replacing placeholders stored by earlier rows.
        # NOTE(review): this runs for every row of an already-seen UniProt
        # ID, not only when a new state is added -- confirm this matches
        # the intended nesting.
        if not is_missing_gene(gene):
            for entry in groups.values():
                entry[1] = gene
    return result


def format_results(result):
    """Return one tab-separated report line per (uni_id, group) pair.

    Columns: gene, species, uni_id, number of states found for the uni_id,
    state, number of PDB entries in the state, space-joined PDB entries.
    """
    lines = []
    for unip, groups in result.items():
        grp_num = len(groups)
        for conf, (pdbs, gene, species) in groups.items():
            # str() keeps the join from failing on non-string PDB cells
            pdb_lst = ' '.join(str(pdb) for pdb in pdbs)
            lines.append('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}'.format(
                gene, species, unip, grp_num, conf, len(pdbs), pdb_lst))
    return lines


def main(argv=None):
    """Read the classification csv named in argv[1] and print the report.

    argv defaults to sys.argv. Exits with the usage text when the argument
    count is not exactly one file name.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) != 2:
        sys.exit(msg.format(argv[0] if argv else '6_count_kinase_type.py'))

    ##########################################################################
    ## read database(.csv) into dataframe
    data = pandas.read_csv(argv[1], sep=',', header='infer')
    rows = (row for _, row in data.iterrows())
    for line in format_results(collect_results(rows)):
        print(line)


if __name__ == '__main__':
    main()