-
Notifications
You must be signed in to change notification settings - Fork 0
/
x_search_align.py
executable file
·257 lines (206 loc) · 10.7 KB
/
x_search_align.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
#!/usr/bin/env python3
import sys,os,re
import pandas as pd
from Bio import SeqIO
from Bio.PDB import PDBIO
from Bio.SeqRecord import SeqRecord
from Bio.PDB.PDBParser import PDBParser
from Bio.PDB.Polypeptide import PPBuilder
p = PDBParser(PERMISSIVE=1, QUIET=True)
##########################################################################
# Get FASTA from PDB structures, find the top10 most similar sequence, align
class GenerateProfileAlignment( object ):
def __init__( self, hom_dir=None, tmp_dir=None, rst_dir=None,
ref_pdb=None, f_nogap=None, f_dict=None ):
self.tmp_dir = tmp_dir
self.hom_dir = hom_dir
self.rst_dir = rst_dir
self.ref_pdb = ref_pdb
self.f_nogap = f_nogap
self.f_dict = f_dict
def __call__( self, pdb ):
return self.ProfileAlignment( pdb )
def ProfileAlignment( self, pdb ):
# work on the target PDB
pdb_id = pdb.split('/')[-1].split('.')[0]
fasta = FASTA_Gen( pdb, pdb_id )
with open('_TEMP.tget.{0}.fasta'.format(pdb_id), 'w') as fo:
SeqIO.write(fasta, fo, 'fasta')
Identity = BlastpPairwiseIdentity( self.tmp_dir,
'_TEMP.tget.{0}.fasta'.format(pdb_id), self.f_nogap )
# Handle input that may not be kinase (very low identity) and not include
# them in the next round of process
if Identity is None:
print('\n \033[31m#2# Alignment Warning: No match to any kinase: \033[31m{0}\033[0m'.format(pdb_id))
return None
elif Identity[0][2] < 30.0:
print('\n \033[31m#2# Alignment Warning:\033[0m Seq identity \033[31m< 30%\033[0m to any kinase {0} x {1}: {2:4.1%}%'.format(pdb_id, Identity[0][0], Identity[0][2]))
return None
else:
# Write out top most similar sequence to the input sequence for alignment
# Result appears to be more consistent and better than using MSA
with open('_TEMP.prof.{0}.fasta'.format(pdb_id), 'w') as fo:
SeqIO.write(self.f_dict[self.ref_pdb], fo, 'fasta')
for item in Identity[0:1]:
# print('# Most similar Kinase for {0}: {1} - {2}'.format(
# pdb_id, item[0], item[1]) )
SeqIO.write(self.f_dict[item[0]], fo, 'fasta')
# use Muscle to align target seq to Profile seq, extract the aligned target seq
MuscleProfileAlignment(
'_TEMP.prof.{0}.fasta'.format(pdb_id),
'_TEMP.tget.{0}.fasta'.format(pdb_id),
'_TEMP.comb.{0}.fasta'.format(pdb_id) )
Data = CacheSeqDatabase('_TEMP.comb.{0}.fasta'.format(pdb_id))
os.system('rm _TEMP.prof.{0}.fasta _TEMP.tget.{0}.fasta _TEMP.comb.{0}.fasta'.format(pdb_id))
return Data[pdb_id]
##########################################################################
# Use Blastp to generate pairwise percent identity between a query sequence
# and a database of sequences. Do not generate a matrix of pairwise identity
# to save on time.
# Calculate sequence identity and similarity of a query seq to a library of
# sequence (or single seq) and output a list with the best one at the 1st row.
# Only takes in ungapped sequences.
def BlastpPairwiseIdentity( rst_dir, mdl_prot_fasta, fasta_database ):
# If input Fasta is a file, reconfigure to only the fasta name
if os.path.isfile(mdl_prot_fasta):
fasta_name = mdl_prot_fasta.split('.fasta')[0]
else:
fasta_name = mdl_prot_fasta
# print('\n \033[34m** Calculate Sequence Identity between Query and Database Sequences **\033[0m')
# print(' Query Fasta: {0}.fasta'.format(fasta_name))
# print(' Fasta Database: '+fasta_database)
# blastp to output: Name, AA_length, percent identity, percent positive
# result in .csv format, omit other irrelevant data
os.system('blastp -query "{0}.fasta" -subject "{1}" -max_target_seqs 5000 -out "{2}/{3}.idmat.txt" -outfmt "6 sseqid length pident ppos"'.format(fasta_name, fasta_database, rst_dir, fasta_name.split('/')[-1]))
# Parse percent identity result generated by BlastP. Did not use clustalo or
# t_coffee because they do redundent pairwise identity calculation for other
# kinases to create a true matrix and that is not needed; only need 1 set of
# pairwise identity between query sequence and the database sequences.
# Sometimes non-kinase sequence will fail in Blastp (ASCT2 2nww.pdb). Return 0
if not os.path.isfile('{0}/{1}.idmat.txt'.format(
rst_dir, fasta_name.split('/')[-1])):
print('\n \033[31m#2# Alignment Warning:\033[0m Cannot find Blastp output. Seq identity to kinase too low? '+fasta_name)
return None
elif os.stat('{0}/{1}.idmat.txt'.format(
rst_dir, fasta_name.split('/')[-1])).st_size == 0:
print('\n \033[31m#2# Alignment Warning:\033[0m Blastp failed. Seq Identity to kinase too low? '+fasta_name)
return None
## Extract the identity information from Blastp result; sometimes a chain is
## broken into fragments, need to combine them according to residue ratios
Ident = {}
with open('{0}/{1}.idmat.txt'.format(
rst_dir, fasta_name.split('/')[-1]), 'r') as fi:
for line in fi:
Items = line.split('\t')
name, aa, identity, positive = (Items[0].split('/')[0].split('|')[0],
int(Items[1]), float(Items[2]), float(Items[3]) )
if name in Ident:
Ident[name].append( [name, aa, identity, positive] )
else:
Ident[name] = [ [name, aa, identity, positive] ]
# Convert dictionary into Tulip data. If a Fasta name has multiple lines,
# the alignment/identity calculation is broken down into pieces for 1 seq.
# Summarize the pieces into 1 by adding up the ratio
Data = []
for name in Ident:
length = sum(list(zip(*Ident[name]))[1]) # rearrange tulip groups
x, y = 0.0, 0.0
for row in Ident[name]:
x += row[1] * row[2]
y += row[1] * row[3]
Data.append( [name, length, (x/length), (y/length)] )
############################
# sort the dataset by percent identity or positive, then by available length
pdata = pd.DataFrame(Data)
pdata.columns = ['Kinase', 'Length', 'Identity', 'Similarity']
# sort selection: percent identity then by length
pdata_temp = pdata.sort_values( by=['Identity', 'Length'], ascending=[False, False] )
col = pdata_temp.columns.tolist()
col = col[-1:] + col[:-1]
pdata_temp = pdata_temp[col]
pdata = pdata_temp
# pdata.to_csv('{0}/{1}.idmat.sort.txt'.format( rst_dir, fasta_name.split('/')[-1]),
# sep='\t', encoding='utf-8', float_format='%4.2f', index=False )
os.system('rm {0}/{1}.idmat.txt'.format(rst_dir, fasta_name.split('/')[-1]))
Data = pdata[['Kinase','Length','Identity','Similarity']].iloc[:,:].to_numpy()
return Data
##########################################################################
## Check the input PDB list, first read in the PDB file to identify
## the number of chains and if the chain has at least 235aa. For those
## could be kinase (>235aa), output a new PDB file with chain_id
def CheckInputStructures( pdb ):
pdb_name = pdb.split('/')[-1]
pdb_id = pdb_name.split('.')[0]
hom_dir = pdb.split('{0}'.format(pdb_name))[0]
m = p.get_structure(pdb_id, pdb)
print('\n #1# Superpose Info: Input PDB {0} has {1:2d} chain(s)'.format(
pdb_name, len(m.get_chains()) ))
Targets = []
# get individual chains in PDB and check
for chain in m.get_chains():
chain_id = chain.get_id()
Res = chain.get_residues()
if len([r for r in Res if not re.search(r'H_|W', r.get_id()[0])]) < 220:
print('\n \033[31m#2# Superpose Warning:\033[0m {0}_{1} has < 220 residues, unlikely a kinase. Skip this chain.'.format( pdb_id, chain_id ))
else:
if re.search(r'_', pdb_id):
if re.search('{}'.format(chain_id), pdb_id.split('_')[-1] ):
new_pdb = '{0}/{1}.pdb'.format(hom_dir, pdb_id, chain_id)
else:
new_pdb = '{0}/{1}_{2}.pdb'.format(hom_dir, pdb_id, chain_id)
else:
new_pdb = '{0}/{1}_{2}.pdb'.format(hom_dir, pdb_id, chain_id)
w = PDBIO()
w.set_structure(chain)
w.save(new_pdb)
Targets.append(new_pdb)
return Targets
##########################################################################
## Create a cache database of known Xtal PDB alignments
## Reformat the FASTA header, PDB naming is split by '|', convert ':' to '_'
## Remove 'description' or it will mess up the fasta header
def CacheSeqDatabase( fasta_file ):
# print('\n \033[34m## Caching sequence database:\033[0m '+fasta_file+'\n')
Database = {}
for seq_record in SeqIO.parse(fasta_file, 'fasta'):
new_id = seq_record.id.split('/')[0].split('|')[0].replace(':', '_')
seq_record.description = ''
seq_record.id = new_id
Database[new_id] = seq_record
return Database
###########################################################################
# Generate the FASTA sequence from the PDB structure, using BioPython
def FASTA_Gen( pdb_name, pdb_id ):
# print('\n ## Convert PDB into FASTA for: \033[31m{0} - {1}\033[0m\n'.format(pdb_name, pdb_id))
peptide = PPBuilder().build_peptides( p.get_structure(pdb_id, pdb_name) )
seq = ''
for residue in peptide:
seq = seq + residue.get_sequence()
seq_obj = SeqRecord( seq, id=pdb_id, description='' )
return seq_obj
##########################################################################
## Muscle to perform profile alignment
## Alignment using a pre-existing MSA profile. T-Coffee has issue with this.
## MUSCLE and ClustalO came out about same time, 2004 and 2003, MUSCLE performs
## the best especially when doing single-seq profile alignment. Output the
## profile-aligned fasta object
## Penalities for gap opening and extension are modified to enforce the new
## alignment adopt the same gapping as the profile sequence. See examples:
## http://www.drive5.com/muscle/muscle_userguide3.8.html
## https://www.dnastar.com/manuals/MegAlignPro/15.3/en/topic/muscle-alignment-options
def MuscleProfileAlignment( profile_fasta, target_fasta, output_fasta ):
os.system('muscle -profile -in1 {0} -in2 {1} -out {2} -maxiters 64 -seqtype protein -gapopen -5.0 -gapextend -2.0 -center 0.0 -quiet'.format(
profile_fasta, target_fasta, output_fasta ))
##########################################################################
# Reformat and no alignment: Remove empty gap column from aligned FASTA file
# the '-action +rm_gap <% empty>' tag indicate which columns to remove, if
# column contains <% empty> seq that are empty in that column
# if not include '-action +rm_gap <>' flag, all '-' will be removed
def RemoveFastaGapColumn( fasta_input, fasta_output ):
os.system('t_coffee -other_pg seq_reformat -in "{0}" -action +rm_gap 100 -output=fasta > {1}'.format(fasta_input, fasta_output))
##########################################################################
#
# v1.0 18.03.11
# v2.0 20.01.12
#
#