#!/usr/bin/env python
# Find ORFs in DNA.
# Put markup around them.

# NOTE: this import shadows the builtin input(); within this module,
# input() iterates over the lines of the files named on the command line
# (or stdin when none are given).
from fileinput import input
import string
import sys
import re

# Design notes on the promoter/ORF pattern:
#   * A variable-width look-behind such as r'(?<=TATA.{1,10})ATG...' is
#     rejected by `re` ("look-behind requires fixed-width pattern"), so the
#     TATA box is matched normally and the ORF is captured as group 1.
#   * A greedy codon repeat `(...)*` runs on to the LAST stop codon and so
#     swallows earlier stop codons; the lazy `(...)*?` ends the ORF at the
#     first in-frame stop codon.
#   * The stop codons of the standard genetic code are TAG, TGA and TAA.
#     (TTA codes for leucine and must not terminate an ORF.)
_orfPattern = r'ATG(...)*?(TAG|TGA|TAA)'
_orf = re.compile(_orfPattern)

# A TATA box, 1 to 30 arbitrary characters, then an ORF (capture group 1).
_codingRegion = re.compile(r'TATA.{1,30}(%s)' % _orfPattern)

# Offset at which findCodingRegions() starts scanning unless told otherwise.
_defaultSearchStart = 3000


def findOrfs(theSeq):
    """Return (start, end) pairs for TATA-preceded ORFs in theSeq.

    Each pair is ``(m.start(), m.end() - 3)`` for a match ``m`` of the
    promoter+ORF pattern: ``start`` is the index of the TATA box that opens
    the match (not of the ATG), and ``end`` is the index just before the
    stop codon.  The scan resumes three characters after the start of the
    previous match, so reported regions may overlap.

    NOTE(review): ``_orf`` is compiled above but unused; this function may
    have been meant to search with it.  Behaviour is kept as-is.
    """
    answer = []
    m = _codingRegion.search(theSeq)
    while m:
        answer.append((m.start(), m.end() - 3))
        m = _codingRegion.search(theSeq, m.start() + 3)
    return answer


def findCodingRegions(theSeq, searchStart=_defaultSearchStart):
    """Return (start, end) spans of ORFs that follow a TATA box in theSeq.

    Each span covers capture group 1 of the promoter+ORF pattern, i.e. the
    ORF from its ATG start codon through its stop codon inclusive
    (``theSeq[start:end]`` begins with ATG and ends with the stop codon).

    theSeq      -- the sequence text to scan (upper-case bases are matched).
    searchStart -- index at which scanning begins; defaults to 3000, the
                   offset this function has always used.  Pass 0 to scan
                   the whole sequence.

    The scan resumes three characters after the start of the previous full
    match (its TATA box), so reported spans may overlap or repeat.
    """
    answer = []
    m = _codingRegion.search(theSeq, searchStart)
    while m:
        answer.append(m.span(1))
        m = _codingRegion.search(theSeq, m.start() + 3)
    return answer


if __name__ == '__main__':
    for line in input():
        marker = 0
        for begin, end in findCodingRegions(line):
            # Text between the previous region and this one.
            print(line[marker:begin])
            marker = begin + 3
            # The three fields are the start codon, the codons between it
            # and the stop codon, and the stop codon itself.
            print('%s%s%s'
                  % (line[begin:marker], line[marker:end - 3],
                     line[end - 3:end]))
            marker = end
        # Whatever follows the last region (the whole line if none found).
        print(line[marker:])