#!/usr/bin/env python # Find ORFs in DNA. # Put markup around them. from fileinput import input import string import sys import re _orfPattern = r'ATG(...)*?(TAG|TGA|TTA)' _orf = re.compile( _orfPattern ) def findOrfs( theSeq ): """Return a list of the ORFs. >>> findOrfs( "" ) [] >>> findOrfs( "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" ) [] >>> findOrfs( "AAATGAATAATAATGATTGAGGGG" ) [(2, 17)] >>> findOrfs( "AAATGAATAATAATGATTGAGGGATGTAG" ) [(2, 17), (23, 26)] """ answer = [] m = _orf.search( theSeq ) while ( m ): answer.append( ( m.start(), m.end()-3 ) ) m = _orf.search( theSeq, m.start()+3 ) return answer def markOrfs( theSeq ): """Mark up sequence data with ORF data. >>> markOrfs( "AAATGAATAATAATGATTGAGGG" ) 'AAATGAATAATAATGATTGAGGG' """ a = findOrfs( theSeq ) answer = "" marker = 0; for p in a: if p[0] < marker: continue answer += theSeq[ marker: p[0] ] marker = p[0] + 3 answer += '%s%s%s' \ % ( theSeq[ p[0]: marker ], theSeq[ marker: p[1] ], theSeq[ p[1]: p[1]+3 ] ) marker = p[1] + 3 answer += theSeq[ marker: ] return answer #_codingRegion = re.compile( r'(?<=TATA.{1,10})ATG(...)*(TAG|TGA|TTA)' ) # sre_constants.error: look-behind requires fixed-width pattern _codingRegion = re.compile( r'TATA.{1,30}(%s)' % _orfPattern ) def findCodingRegions( theSeq ): """Return a list of the ORFs that might be coding regions. >>> src = ''' ... GACTATATCTTATAAACCAATGGTACCAAAACTGGAGCGTCGATGTCCGGAAATTGATAT ... TCTGTTCCTGACCGGGAGCTCCGCCGGAAGATTTTCACCGTATACTCGGGCACCAGGAGA ... TCTGCATGGAAGAGCGATGTGTCGACCGTTCCGTGGCATAACTCCGCCTAGCGGCAAAGC ... GAAGTTGTTTGACAGGTGATGAAAAAAACAAAAAATGTCGAACGAGTAGCGTGCATAATG ... AGTGCGTCAATGCTGTATTTCTGATCGTGGCGTGAATACGCGTGGTACGCCGCAAAAGCC ... TTATGTGCAGGAGCACTGGCGGCATCTTCTCTGTCATTTTGCCTTCAAAACG ... ''' >>> src = src.replace('\\n', '') >>> findCodingRegions( src ) [(19, 199), (42, 57), (125, 275)] """ answer = [] mStart = 0 m = _codingRegion.search( theSeq, mStart ) while ( m ): answer.append( m.span( 1 ) ) mStart = m.start() + 3 m = _codingRegion.search( theSeq, mStart ) return answer def markCodingRegions( theSeq ): """Mark up sequence data with coding region data. 123456789 123456789 123456789 123456789 123456789 123456789 >>> src = ''' ... GACTATATCTTATAAACCAATGGTACCAAAACTGGAGCGTCGATGTCCGGAAATTGATAT ... TCTGTTCCTGACCGGGAGCTCCGCCGGAAGATTTTCACCGTATACTCGGGCACCAGGAGA ... TCTGCATGGAAGAGCGATGTGTCGACCGTTCCGTGGCATAACTCCGCCTAGCGGCAAAGC ... GAAGTTGTTTGACAGGTGATGAAAAAAACAAAAAATGTCGAACGAGTAGCGTGCATAATG ... AGTGCGTCAATGCTGTATTTCTGATCGTGGCGTGAATACGCGTGGTACGCCGCAAAAGCC ... TTATGTGCAGGAGCACTGGCGGCATCTTCTCTGTCATTTTGCCTTCAAAACG ... ''' >>> src = src.replace('\\n', '') >>> html = markCodingRegions( src ) >>> html[:7] 'GACTATA' >>> html[-9:] 'TTCAAAACG' >>> len( html ) 463 >>> html[18:96] 'AATGG' >>> html[266:313] 'AGGTGATGA' >>> html[314:].index( '<' ) Traceback (most recent call last): ... ValueError: substring not found """ a = findCodingRegions( theSeq ) answer = "" marker = 0; for p in a: if p[0] < marker: continue answer += theSeq[ marker: p[0] ] marker = p[0] + 3 answer += '%s%s%s' \ % ( theSeq[ p[0]: marker ], theSeq[ marker: p[1]-3 ], theSeq[ p[1]-3: p[1] ] ) marker = p[1] answer += theSeq[ marker: ] return answer def _markEm( theInput ): for line in theInput: print markCodingRegions( line ) def _testdoc(): import doctest doctest.testmod() if __name__ == '__main__': if 1 < len( sys.argv ): if "doctest" == sys.argv[1]: _testdoc() else: try: f = open( sys.argv[1] ) except IOError, e: print e else: _markEm( f ) f.close() else: _markEm( input() )