You will see a long string with mysterious symbols because every file is a string of symbols, nevertheless if is is a Word document, Excel sheet or Python program. It is up to the associated application to interpret those symbols when opening a file. For example a Word document is much more than only the typed text but contains formatting and structure information and a plain text file only containing the pure text could not represent this.
acc = 0
fh = open("numbers.txt", "r")
for line in fh:
acc += int(line.rstrip())
fh.close()
print(acc)
fh = open("short.fasta", "r")
for line in fh:
if line[0] == ">":
print(line.rstrip())
fh.close()
acc = 0
with open("numbers.txt", "r") as fh:
for line in fh:
acc += int(line.rstrip())
print(acc)
with open("short.fasta", "r") as fh:
for line in fh:
if line[0] == ">":
print(line.rstrip())
with open("status_lines.txt", "w") as fh_out:
with open("short.fasta", "r") as fh_in:
for line in fh_in:
if line[0] == ">":
print(line.rstrip(), file=fh_out)
If you want to understand how a program works, or why a program does not work as intended you can trace the flow of execution and the the current state of variables by inserting appropriate print
function calls:
with open("short.fasta", "r") as fh:
for line in fh:
line = line.rstrip()
print()
print("line:", line)
if len(line) > 0 and line[0] == ">":
last_status = line
count = 0
elif line == "":
print("COUNT:", count, last_status)
else:
count += len(line)
print("last_status:", last_status)
print("count:", count)
I ommited the long output.
The plain print()
creates an empty line, this enhances the readability. Further I marked the "regular" output with COUNT:
to distinguish this from the other lines.
with open("fasta_stat.csv", "w") as fh_csv:
# write header
print("description, symbol_count, gc content", file=fh_csv)
with open("short.fasta", "r") as fh_fasta:
for line in fh_fasta:
line = line.rstrip()
if len(line) > 0 and line[0] == ">":
last_status = line
count = 0
gc_count = 0
elif line == "":
relative_gc_count = gc_count / count
# for checking if output is as intended
print(last_status, ",", count, ",", relative_gc_count)
# write one line to csv file
print(last_status, ",", count, ",", relative_gc_count, file=fh_csv)
else:
count += len(line)
gc_count += line.count("G") + line.count("C")
First we create a suitable FASTA file without the empty lines:
with open("short_2.fasta", "w") as fh_out:
with open("short.fasta", "r") as fh_in:
for line in fh_in:
line = line.rstrip()
if line != "":
print(line, file=fh_out)
This was the previous solution:
with open("short.fasta", "r") as fh:
for line in fh:
line = line.rstrip()
if len(line) > 0 and line[0] == ">":
last_status = line
count = 0
elif line == "":
print(count, last_status)
else:
count += len(line)
For the new file we could display count
and last_status
in the code block after if len(line) > 0 and line[0] == ">":
but for the very first status line count
and last_status
are not set.
To handle this issue we use an indicator value -1
for count
which will be set to 0
after handling the first sequence description:
count = -1
with open("short.fasta", "r") as fh:
for line in fh:
line = line.rstrip()
if len(line) > 0 and line[0] == ">":
if count >= 0: # this is False for the first status line, but True for all others
print(count, last_status)
last_status = line
count = 0
else:
count += len(line)
This is still not correct because we do not see the information about the last sequence. As the last line is not a status line we have to handle this situation at the end of the script:
count = -1
with open("short.fasta", "r") as fh:
for line in fh:
line = line.rstrip()
if len(line) > 0 and line[0] == ">":
if count >= 0: # this is False for the first status line, but True for all others
print(count, last_status)
last_status = line
count = 0
else:
count += len(line)
print(count, last_status)
Alternative solutions would be:
first_status_line_seen
which is initalizied with False
and set to True
after the first status line was read. Then call print(count, last_status)
if this value is True
.print
depending on this value.import csv
with open("fasta_stat.csv", "w") as fh_csv:
csv_writer = csv.writer(fh_csv)
csv_writer.writerow(["description", "symbol_count", "gc content"])
with open("short.fasta", "r") as fh_fasta:
for line in fh_fasta:
line = line.rstrip()
if len(line) > 0 and line[0] == ">":
last_status = line
count = 0
gc_count = 0
elif line == "":
relative_gc_count = gc_count / count
print(count, gc_count, last_status)
csv_writer.writerow([last_status, count, relative_gc_count])
else:
count += len(line)
gc_count += line.count("G") + line.count("C")
import csv
with open("amino_acids.csv", "r") as fh:
for line in csv.reader(fh, delimiter=","):
print(line)
import csv
count = 0
mono_sum = 0.0
with open("amino_acids.csv", "r") as fh:
r = csv.reader(fh, delimiter=",")
next(r)
for line in r:
mono_sum += float(line[3])
count += 1
print("average monoisotopic mass is", mono_sum / count)
import csv
count = 0
mono_sum = 0.0
max_mass = 0
aa_with_max_mass = ""
with open("amino_acids.csv", "r") as fh:
r = csv.reader(fh, delimiter=",")
header = next(r)
for line in r:
mono_mass = float(line[3])
if mono_mass > max_mass:
max_mass = mono_mass
aa_with_max_mass = line[1]
mono_sum += mono_mass
count += 1
print("average monoisotopic mass is", mono_sum / count)
print("max monoisotopic mass is", max_mass)
print("the aa with max mass is", aa_with_max_mass)
(asking the user until he enters only one sympol was not part of the exercise desciption, I added it just for demonstration purposes):
while True:
one_letter_code = input("tell me a one letter code: ").upper()
if len(one_letter_code) != 1:
print("this is not a one letter input, try again")
else:
break
found = False
with open("amino_acids.csv", "r") as fh:
r = csv.reader(fh, delimiter=",")
next(r)
for line in r:
if line[0] == one_letter_code:
print(one_letter_code, "has monoisotopic mass", line[3], ", its molecular formula is", line[2])
found = True
if not found:
print("the given code is invalid")