"""
helper functions for nice formatting of mass formulas.
I tried to come up with small functions and to avoid
code duplication
"""
def format_one(letter, count):
""" (H, 0) -> "",
(H, 1) -> "H",
(H, 2) -> "H2"
"""
if count == 0:
return ""
elif count == 1:
return letter
else:
return letter + str(count)
def format_mf(ci, hi, oi):
"""builds a prettified mass formula for ci C atoms + hi H atoms + oi O atoms
input: ci is count of "C" atoms
hi is count of "H" atoms
oi is count of "O" atoms
return: nicely formatted molecular sum formula
"""
# we call such checks also "defensive programming":
assert ci >= 0
assert hi >= 0
assert oi >= 0
return format_one("C", ci) + format_one("H", hi) + format_one("O", oi)
def format_mf_2(ci, hi, oi):
"""alternative implementation based on an idea from course attendee"""
mf = "C{}".format(ci) + "H{}".format(hi) + "O{}".format(oi)
mf = mf.replace("C0", "").replace("C1", "C")
mf = mf.replace("H0", "").replace("H1", "H")
mf = mf.replace("O0", "").replace("O1", "O")
return mf
for ci in (0, 1, 2):
for hi in (0, 1, 2):
for oi in (0, 1, 2):
print(ci, hi, oi, format_mf_2(ci, oi, hi))
def formulas_in_range(mz_min, mz_max):
"""simple optimizations for upper bounds"""
assert mz_min > 0
assert mz_min <= mz_max
mass_C = 12.0
mass_H = 1.0078250319
mass_O = 15.994915
c_max = int(mz_max / mass_C)
result= []
for ci in range(c_max + 1):
h_max = int(mz_max / mass_H)
for hi in range(h_max + 1):
o_max = int(mz_max / mass_O)
for oi in range(o_max + 1):
mass = ci * mass_C + hi * mass_H + oi * mass_O
if mz_min <= mass <= mz_max:
result.append((mass, ci, hi, oi))
return result
for mass, ci, hi, ho in formulas_in_range(100, 100.1):
mf = format_mf(ci, hi, ho)
print("{:15s} {:f}".format(mf, mass))
Some extra checks for the upper bound computations:
mass_C = 12.0
mass_H = 1.0078250319
mass_O = 15.994915
print(formulas_in_range(mass_H, mass_H))
print(formulas_in_range(mass_C, mass_C))
print(formulas_in_range(mass_O, mass_O))
We reimplement our solution using a so called generator. A generator looks like a function, but the body of the function contains at least one yield
statement.
This generators allow implementing your own iterators. You remember iterators ?
# range returns an iterator
for i in range(4):
print(i, end=" ")
print()
# strings are iterators
for c in "abcd":
print(c, end=", ")
print()
# lists also
for x in ["u", "v", "w"]:
print(x, end="; ")
# and dicts (iterates over keys), tuples, file handles, etc....
Here we implement a generator which yields values 1, 2, 3 when used as an iterator.
def example_gen():
yield 1
yield 2
yield 3
for i in example_gen(): # () needed !
print(i, end=" ")
If we create the iterator calling my_gen()
no code in the body of this generator is executed. Every iteration executes code until the next yield
and the value after yield
is "returned" to the loop using the generator:
def my_gen():
print(" >> this is my_gen, execution begins")
yield 1
print(" >> this is my_gen after yield 1")
yield 2
print(" >> this is my_gen after yield 2")
# look at the output below and match the lines to
# the code.
# only construction, no execution of the body:
generator = my_gen()
print("top level after calling my_gen()")
for element in generator:
print("top level:", element)
Generators are special cases of iterators, so we can also pass them to list
, set
, ...:
print(list(my_gen()))
print(set(my_gen()))
for (i, v) in enumerate(my_gen()):
print(i, v)
So we can reimplement the built-in enumerate
method like this:
def my_enumerator(iterator):
i = 0
for value in iterator:
yield (i, value)
i += 1
for i, v in my_enumerator("ab"):
print(i, v)
print()
for i, v in my_enumerator(range(2)):
print(i, v)
print()
for i, v in my_enumerator(["b", "c"]):
print(i, v)
def formulas_in_range(mz_min, mz_max):
"""simple optimizations for upper bounds"""
assert mz_min > 0
assert mz_min <= mz_max
mass_C = 12.0
mass_H = 1.0078250319
mass_O = 15.994915
c_max = int(mz_max / mass_C)
for ci in range(c_max + 1):
h_max = int(mz_max / mass_H)
for hi in range(h_max + 1):
o_max = int(mz_max / mass_O)
for oi in range(o_max + 1):
mass = ci * mass_C + hi * mass_H + oi * mass_O
if mz_min <= mass <= mz_max:
yield mass, ci, hi, oi
for mass, ci, hi, oi in formulas_in_range(100, 100.1):
mf = format_mf(ci, hi, oi)
print("{:15s} {:f}".format(mf, mass))
# same checks as above
mass_C = 12.0
mass_H = 1.0078250319
mass_O = 15.994915
print(list(formulas_in_range(mass_H, mass_H)))
print(list(formulas_in_range(mass_C, mass_C)))
print(list(formulas_in_range(mass_O, mass_O)))
def formulas_in_range_optimized(mz_min, mz_max):
""" better optimization for upper bounds and for
lower bound of most inner loop"""
assert mz_min > 0
assert mz_min <= mz_max
mass_C = 12.0
mass_H = 1.0078250319
mass_O = 15.994915
c_max = int(mz_max / mass_C)
for ci in range(c_max + 1):
mass_explained_c = mass_C * ci
h_max = int((mz_max - mass_explained_c) / mass_H)
for hi in range(h_max + 1):
mass_explained_ch = mass_explained_c + hi * mass_H
o_max = int((mz_max - mass_explained_ch) / mass_O)
o_min = int((mz_min - mass_explained_ch) / mass_O)
for oi in range(o_min, o_max + 1):
mass = mass_explained_ch + oi * mass_O
if mz_min <= mass <= mz_max:
yield mass, ci, hi, oi
for mass, ci, hi, oi in formulas_in_range(100, 100.1):
mf = format_mf(ci, hi, oi)
print("{:15s} {:f}".format(mf, mass))
# same checks as above
mass_C = 12.0
mass_H = 1.0078250319
mass_O = 15.994915
print(list(formulas_in_range_optimized(mass_H, mass_H)))
print(list(formulas_in_range_optimized(mass_C, mass_C)))
print(list(formulas_in_range_optimized(mass_O, mass_O)))
# quick demo time module:
import time
print("seconds since 1st jan 1970 is", time.time())
started = time.time()
time.sleep(1.2)
print("sleep needed {} seconds".format(time.time() - started))
We run the function multiple times for multiple mz bounds. For proper measurement we use median of multiple time measurements to compensate the influence of background computations on your machine.
Exercise: also compute the std deviations !
# compare methods
import time
# since Python 3.4, see https://docs.python.org/3/library/statistics.html:
import statistics
def measure_time(mf_generator, mz_min, mz_max, n_runs=7):
"""this function helps us to avoid code
duplication for benchmarking different generators"""
times = []
for __ in range(n_runs):
started = time.time()
# we must use list here to exhaust the generator !
result = list(mf_generator(mz_min, mz_max))
needed = time.time() - started
times.append(needed)
return len(result), statistics.median(times)
for mz_min, mz_max in [(100, 100.1), (100, 101), (500, 501), (1000, 1000.1), (1000, 1001)]:
n_naive, t_naive = measure_time(formulas_in_range, mz_min, mz_max)
n_optimized, t_optimized = measure_time(formulas_in_range_optimized, mz_min, mz_max)
# another defensive check to be sure that our optimization did not discard
# results:
assert n_naive == n_optimized
print("found {} formulas in range {}..{}".format(n_naive, mz_min, mz_max))
print("naive method needs {:.2e} seconds, optimized needs {:.2e} seconds".format(t_naive, t_optimized))
print("speed up is {:.2f}".format(t_naive / t_optimized))
print()