We use the `requests` library to fetch the data from within Python.
The next command, which starts with `!`, only works within Jupyter, the tool I use for these scripts.
Outside of Jupyter you might need to install `requests` differently.
!pip install requests
import requests
def fetch_data(
    url="https://siscourses.ethz.ch/python_dbiol/data/codon_table.txt",
    timeout=30,
):
    """Fetch the codon table as text over the internet.

    Args:
        url: Address of the codon table text file. Defaults to the file on
            the course server.
        timeout: Number of seconds to wait for the server before giving up.

    Returns:
        The body of the HTTP response as a string.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
        requests.RequestException: If the request fails otherwise, e.g.
            because of a connection problem or a timeout.
    """
    # NOTE(review): verify=False switches off TLS certificate verification to
    # circumvent https security issues. It may cause an InsecureRequestWarning
    # and must not be used for sources you do not trust.
    response = requests.get(url, verify=False, timeout=timeout)
    # Fail loudly here instead of handing an HTML error page to the parser.
    response.raise_for_status()
    return response.text
# you might see a warning if you run the following line, you can ignore this warning:
# print(fetch_data()[:300])
Below I use Python's feature of treating empty strings as `False`. (The same holds for empty lists, tuples, dictionaries and sets, as well as for the values `0`, `0.0` and `None`):
def build_mapping(text):
    """Create the lookup table RNA codon -> amino acid symbol.

    The table is parsed from the downloaded text file, so this function
    depends heavily on the exact formatting of that text: the first line is
    treated as a header and skipped, every other non-empty line is split at
    single spaces, and every fifth field (starting with the first one) is
    used as a codon with its right-hand neighbour as the symbol.
    """
    codon_to_symbol = {}
    rows = text.split("\n")[1:]  # drop the header line
    for row in rows:
        row = row.strip()  # get rid of surrounding whitespace
        if not row:
            # nothing to parse on empty lines
            continue
        columns = row.split(" ")
        # the interesting data sits in columns 0/1, 5/6, 10/11, and so on:
        for start in range(0, len(columns), 5):
            symbol = columns[start + 1]
            codon = columns[start]
            codon_to_symbol[codon] = symbol
    return codon_to_symbol
# print(build_mapping(fetch_data()))
def read_fasta(path):
    """Read a FASTA file and return all of its records.

    Implementation: every time a header line is seen, the record collected
    so far is stored (if it contains any sequence data).

    Args:
        path: Location of the FASTA file.

    Returns:
        A list of ``(identifier, sequence)`` tuples in file order. The
        identifier is the complete header line including the leading ">".
        The sequence is the concatenation of all lines following that header,
        with trailing whitespace removed. Headers which are not followed by
        any sequence data are skipped.

    Raises:
        ValueError: If sequence data appears before the first header line.
    """
    sequences = []
    status = None  # header line of the record currently being read
    chunks = []  # sequence lines collected for that record
    with open(path, "r") as fh:
        for line in fh:
            line = line.rstrip()  # remove trailing "\n" and other whitespace
            if line.startswith(">"):
                if chunks:
                    # record what we have seen so far:
                    sequences.append((status, "".join(chunks)))
                    chunks = []
                status = line
            elif line:
                if status is None:
                    raise ValueError(
                        "sequence data found before the first '>' header line"
                    )
                # joining once per record avoids repeated string copies
                chunks.append(line)
    # don't ignore the last entry:
    if chunks:
        sequences.append((status, "".join(chunks)))
    return sequences
# print(read_fasta("rna_fake.fasta")[:3])
def translate_rna_to_aa(rna_sequence, mapping):
    """Translate an RNA sequence into a string of amino acid symbols.

    The sequence is cut into consecutive pieces of three characters starting
    at the first character. Each piece is looked up in ``mapping``; pieces
    which are not a key of ``mapping`` (including a shorter piece at the end)
    are translated to "*".
    """
    codon_starts = range(0, len(rna_sequence), 3)
    codons = (rna_sequence[start:start + 3] for start in codon_starts)
    return "".join(mapping.get(codon, "*") for codon in codons)
# mapping = build_mapping(fetch_data())
# sequence = read_fasta("rna_fake.fasta")[0][1]
# translate_rna_to_aa(sequence, mapping)
I build the content of the result file line by line using a list of strings. Finally, the result is constructed with the `join` method of strings:
def translate_fasta_file(in_path, out_path):
    """Translate all RNA sequences of a FASTA file to amino acid sequences.

    The codon table is downloaded first, then all records of the input file
    are read and translated, and finally the result is written in one go.

    Args:
        in_path: Path of the FASTA file holding the RNA sequences.
        out_path: Path of the file to write. For every input record it gets
            the identifier line followed by a line with the translated
            sequence. An existing file is overwritten.
    """
    mapping = build_mapping(fetch_data())
    result_lines = []
    for identifier, rna_sequence in read_fasta(in_path):
        result_lines.append(identifier)
        result_lines.append(translate_rna_to_aa(rna_sequence, mapping))
    content = "\n".join(result_lines)
    if content:
        # terminate the last line as well, so that the file is a regular
        # text file and matches what translate_fasta_file_efficient writes
        content += "\n"
    with open(out_path, "w") as fh:
        fh.write(content)
# Run the translation on the example file: this downloads the codon table,
# reads "rna_fake.fasta" and writes the result to "aa_fake.fasta".
translate_fasta_file("rna_fake.fasta", "aa_fake.fasta")
# first ten lines, only works within jupyter
!head aa_fake.fasta
Here comes a modified version which saves memory by not reading all sequences into memory first; instead, sequences are read "on demand". This allows processing of very huge files which would not fit into your computer's memory.
The implementation below uses so-called "generators", which are explained in the proposed solution for the "sum formula fit" challenge.
def read_fasta_generator(path):
    """Read a FASTA file from the given path record by record.

    In contrast to a function returning a list, this generator hands out one
    record at a time, so only the record currently being read is kept in
    memory.

    Implementation: every time a header line is seen, the record collected
    so far is yielded (if it contains any sequence data).

    Args:
        path: Location of the FASTA file.

    Yields:
        ``(identifier, sequence)`` tuples in file order. The identifier is
        the complete header line including the leading ">". The sequence is
        the concatenation of all lines following that header, with trailing
        whitespace removed. Headers which are not followed by any sequence
        data are skipped.

    Raises:
        ValueError: If sequence data appears before the first header line.
            The error is raised while iterating, not when the generator is
            created.
    """
    status = None  # header line of the record currently being read
    chunks = []  # sequence lines collected for that record
    with open(path, "r") as fh:
        for line in fh:
            line = line.rstrip()  # remove trailing "\n" and other whitespace
            if line.startswith(">"):
                if chunks:
                    # hand out what we have seen so far:
                    yield (status, "".join(chunks))
                    chunks = []
                status = line
            elif line:
                if status is None:
                    raise ValueError(
                        "sequence data found before the first '>' header line"
                    )
                # joining once per record avoids repeated string copies
                chunks.append(line)
    # don't ignore the last entry:
    if chunks:
        yield (status, "".join(chunks))
# uncomment for testing:
#
# for status, seq in read_fasta_generator("rna_fake.fasta"):
# print(status)
# print(seq)
# break
def translate_fasta_file_efficient(in_path, out_path):
    """Translate a FASTA file of RNA sequences record by record.

    This solution avoids holding all data in memory, so we can process FASTA
    files which do not fit into memory: each record is read, translated and
    written before the next one is read.

    Args:
        in_path: Path of the FASTA file holding the RNA sequences.
        out_path: Path of the file to write. For every input record it gets
            the identifier line followed by a line with the translated
            sequence. An existing file is overwritten.
    """
    mapping = build_mapping(fetch_data())
    # The input must only be consumed through the generator; reading it with
    # read_fasta as well would load the complete file into memory.
    with open(out_path, "w") as fh:
        for identifier, rna_sequence in read_fasta_generator(in_path):
            aa_sequence = translate_rna_to_aa(rna_sequence, mapping)
            print(identifier, file=fh)
            print(aa_sequence, file=fh)
# Run the memory-saving variant on the same example file; "aa_fake.fasta" is
# opened for writing again, so its previous content is replaced.
translate_fasta_file_efficient("rna_fake.fasta", "aa_fake.fasta")
# first ten lines, only works within jupyter
!head aa_fake.fasta