# Converts peptide sequences in fasta format to single strings.
# Strips comments and headings, and then prints each sequence, one to a line,
# as "<identifier><TAB><sequence>".
# Input is read from the files named on the command line, or from STDIN
# when no files are given.
#
# We expect IUPAC peptide codes (upper or lower case).
# Two other character codes are common.
#   * is often used for stop codons.
#   X is often used to mean no other aa code.
# More search programs can handle X than *.

use strict;
use warnings;

# Residues accumulated so far for the record currently being read.
# Shared between the read loop (which appends) and loadOneSeq (which
# prints and resets it).
my $seq = '';

# Prints the accumulated sequence as one tab-separated line and resets the
# accumulator so that the next record starts empty.
#
#   $identifier - record name captured from the ">" header line.
#
# Returns nothing; the only effects are the line written to STDOUT and the
# reset of $seq.
sub loadOneSeq($) {
    my ($identifier) = @_;

    # $seq =~ s/\*$//;    # strip any trailing stop.
    # $seq =~ s/X$//;     # strip any trailing X.
    print "$identifier\t$seq\n";

    # Reset to the empty string. Anything else (e.g. a bareword) would be
    # carried over and prepended to the next record's residues.
    $seq = '';
    return;
}

# Identifier of the record currently being accumulated. Starts empty so that
# residues appearing before any header are still reported without an
# "uninitialized value" warning.
my $seqid = '';

while ( my $line = <> ) {
    chomp $line;
    $line =~ s/\r\z//;    # Tolerate CRLF line endings.

    if ( $line =~ /^>(\S+)/ ) {
        # A new header closes the previous record, if it had any residues.
        loadOneSeq($seqid) if length $seq;
        $seqid = $1;
    }
    elsif ( $line =~ /^;/ ) {
        # Skip comments.
    }
    # Note that it is slightly faster to check for an illegal character
    # than for a string of legal ones.
    # elsif ( $line =~ /[^AC-IK-NP-TVWY]/i ) {   # Strict IUPAC codes only.
    # elsif ( $line =~ /[^A-IK-NP-TVWYZ]/i ) {   # Allow IUPAC "ambiguity" codes B and Z.
    elsif ( $line =~ /[^A-IK-NP-TV-Z*]/i ) {     # Allow ambiguity codes and non-IUPAC codes X and *.
    # elsif ( $line =~ /[^A-IK-NP-Z*]/i ) {      # Allow U also which is sometimes used for selenomethionine.
    # elsif ( $line =~ /[^A-IK-Z*]/i ) {         # Allow O also which is sometimes used for ornithine.
        print STDERR "ERROR in sequence $seqid\n";
        print STDERR "$line\n";

        # Should also null out seq and skip to next seq header.
        # Or should we just quit?  For now we quit, saying where.
        die "Aborting: illegal character in sequence '$seqid' at input line $.\n";
    }
    else {
        # Note this also matches blank lines, which append nothing.
        # $line =~ s/\*/X/g;    # Convert stops?
        $seq .= $line;
    }
}

# The last record has no following header to trigger its output, so flush
# it explicitly once the input is exhausted.
loadOneSeq($seqid) if length $seq;