"""
.. codeauthor:: Niklaus Johner <niklaus.johner@a3.epfl.ch>
This module contains basic functions to work with sequences, notably to find motifs,
and build a position specific scoring matrix from an alignment.
"""
__all__=('FindMotif','CreateSequenceFromView')
import ost as _ost
import random
[docs]def CreateSequenceFromView(eh,seq_name):
"""
Returns the sequence of the view *eh*, named *seq_name*.
"""
s=''
for r in eh.residues:
s+=conop.ResidueNameToOneLetterCode(r.name)
return seq.CreateSequence(seq_name,s)
[docs]def FindMotif(motif,sequence):
s=sequence.GetGaplessString()
start_list=[]
for i in range(0,len(s)-len(motif)):
start=False
for el in motif[0]:
if s[i]==el:
start=True
break
if not start:continue
for j in range(1,len(motif)):
if 'X' in motif[j]:
flag=True
continue
flag=False
for el in motif[j]:
if s[i+j]==el:
flag=True
break
if not flag:break
if flag:start_list.append(i)
return start_list
def FindOneOfSeveralMotifs(motif_list,sequence):
for i,motif in enumerate(motif_list):
start_list=FindMotif(motif,sequence)
if len(start_list)>0:return start_list,i
return None
def ShuffleSequence(sequence,fix_gaps=True):
if not fix_gaps:
s=[el for el in sequence]
random.shuffle(s)
s="".join(s)
return seq.CreateSequence(sequence.name,s)
non_gap_indices=[]
s=[el for el in sequence if not el in ["-","?"]]
random.shuffle(s)
s2=""
c=0
for i,el in enumerate(sequence):
if not el in ["-","?"]:
s2+=s[c]
c+=1
else:s2+="-"
return seq.CreateSequence(sequence.name,s2)
def RandomizeAlignment(ali,fix_gaps=True):
ali2=seq.CreateAlignment()
for s in ali.sequences:
s2=ShuffleSequence(s,fix_gaps)
ali2.AddSequence(s2)
return ali2
class PSSM:
"""
Class to create a position specific scoring matrix from a multiple sequence alignment.
The object can then be used to score a sequence.
"""
def __init__(self,ali):#,aa_list=None,pseudocounts=None,aa_frequencies=None):
self.aa_list=['D','E','R','K','H','S','T','N','Q','G','P','Y','W','V','I','L','M','F','A','C']
self.aa_index_dict={}
for i,aa in enumerate(self.aa_list):
self.aa_index_dict[aa]=i
self.aa_frequencies={"A": 0.09398206173005932, "C": 0.014640847892538094, "E": 0.06167898538341536, "D": 0.057537026726662464, "G": 0.07661345622483157, "F": 0.04125062078013584, "I": 0.05988964354148949, "H": 0.023180301946789893, "K": 0.048775277841393995, "M": 0.022045733851907126, "L": 0.09741587583495241, "N": 0.03822084621874947, "Q": 0.03472639486143553, "P": 0.043952778781438454, "S": 0.060691166121181965, "R": 0.05494787451396235, "T": 0.05250697453686419, "W": 0.012220035049334168, "V": 0.07323789898435108, "Y": 0.032486199178507216}
self.ncol=ali.GetLength()
self.ali_depth=ali.GetCount()
nres=len(self.aa_list)
self.matrix=npy.zeros([self.ncol,nres])
print "generating pssm from alignment with {0} col and {1} sequences".format(self.ncol,self.ali_depth)
for i,aa in enumerate(self.aa_list):
self.matrix[:,i]=nres*self.aa_frequencies[aa]
for i,col in enumerate(ali):
for j,aa in enumerate(self.aa_list):
self.matrix[i,j]+=str(col).count(aa)
self.matrix[i,:]/=float(npy.sum(self.matrix[i,:]))
for i,col in enumerate(ali):
for j,aa in enumerate(self.aa_list):
#print self.matrix[i,j]/self.aa_frequencies[aa]
self.matrix[i,j]=math.log(self.matrix[i,j]/self.aa_frequencies[aa])
def ScoreSequence(self,s,start_index):
score=0
for i,c in enumerate(s):
if not c in self.aa_index_dict:continue
score+=self.matrix[i+start_index,self.aa_index_dict[c]]
return score