Solution using lists

"""
helper functions for nice formatting of mass formulas.

I tried to come up with small functions and to avoid
code duplication
"""

def format_one(letter, count):
    """ (H, 0) -> "",
        (H, 1) -> "H",
        (H, 2) -> "H2"
    """
    if count == 0:
        return ""
    elif count == 1:
        return letter
    else:
        return letter + str(count)

    
def format_mf(ci, hi, oi):
    """builds a prettified mass formula for ci C atoms + hi H atoms + oi O atoms
    
    input: ci is count of "C" atoms
           hi is count of "H" atoms
           oi is count of "O" atoms

    return: nicely formatted molecular sum formula
    """
    
    # we call such checks also "defensive programming":
    assert ci >= 0
    assert hi >= 0
    assert oi >= 0
    return format_one("C", ci) + format_one("H", hi) + format_one("O", oi)


def format_mf_2(ci, hi, oi):
    """alternative implementation based on an idea from course attendee"""
    mf = "C{}".format(ci) + "H{}".format(hi) + "O{}".format(oi)
    mf = mf.replace("C0", "").replace("C1", "C")
    mf = mf.replace("H0", "").replace("H1", "H")
    mf = mf.replace("O0", "").replace("O1", "O")
    return mf 

for ci in (0, 1, 2):
    for hi in (0, 1, 2):
        for oi in (0, 1, 2):
            print(ci, hi, oi, format_mf_2(ci, oi, hi))
0 0 0 
0 0 1 H
0 0 2 H2
0 1 0 O
0 1 1 HO
0 1 2 H2O
0 2 0 O2
0 2 1 HO2
0 2 2 H2O2
1 0 0 C
1 0 1 CH
1 0 2 CH2
1 1 0 CO
1 1 1 CHO
1 1 2 CH2O
1 2 0 CO2
1 2 1 CHO2
1 2 2 CH2O2
2 0 0 C2
2 0 1 C2H
2 0 2 C2H2
2 1 0 C2O
2 1 1 C2HO
2 1 2 C2H2O
2 2 0 C2O2
2 2 1 C2HO2
2 2 2 C2H2O2
def formulas_in_range(mz_min, mz_max):
    """simple optimizations for upper bounds"""

    assert mz_min > 0
    assert mz_min <= mz_max
    
    mass_C = 12.0
    mass_H = 1.0078250319
    mass_O = 15.994915
    
    c_max = int(mz_max / mass_C)
    
    result= []
    
    for ci in range(c_max + 1):
   
        h_max = int(mz_max / mass_H)
        for hi in range(h_max + 1):
        
            o_max = int(mz_max / mass_O)
            for oi in range(o_max + 1):
            
                mass = ci * mass_C + hi * mass_H + oi * mass_O
                
                if mz_min <= mass <= mz_max:
                    result.append((mass, ci, hi, oi))
    return result


for mass, ci, hi, ho in formulas_in_range(100, 100.1):
    mf = format_mf(ci, hi, ho)
    print("{:15s} {:f}".format(mf, mass))
H4O6            100.000790
CH8O5           100.037175
C2H12O4         100.073560
C4H4O3          100.016045
C5H8O2          100.052430
C6H12O          100.088815
C8H4            100.031300

Some extra checks for the upper bound computations:

mass_C = 12.0
mass_H = 1.0078250319
mass_O = 15.994915


print(formulas_in_range(mass_H, mass_H))
print(formulas_in_range(mass_C, mass_C))
print(formulas_in_range(mass_O, mass_O))
[(1.0078250319, 0, 1, 0)]
[(12.0, 1, 0, 0)]
[(15.994915, 0, 0, 1)]

Generators in Python

We reimplement our solution using a so called generator. A generator looks like a function, but the body of the function contains at least one yield statement.

This generators allow implementing your own iterators. You remember iterators ?

# range returns an iterator
for i in range(4):
    print(i, end=" ")
print()

# strings are iterators
for c in "abcd":
    print(c, end=", ")
print()    

# lists also
for x in ["u", "v", "w"]:
    print(x, end="; ")
    
# and dicts (iterates over keys), tuples, file handles, etc....
0 1 2 3 
a, b, c, d, 
u; v; w; 

Here we implement a generator which yields values 1, 2, 3 when used as an iterator.

def example_gen():
    yield 1
    yield 2
    yield 3

for i in example_gen():  # () needed !
    print(i, end=" ")
1 2 3 

If we create the iterator calling my_gen() no code in the body of this generator is executed. Every iteration executes code until the next yield and the value after yield is "returned" to the loop using the generator:

def my_gen():
    print("  >> this is my_gen, execution begins")
    yield 1
    print("  >> this is my_gen after yield 1")
    yield 2
    print("  >> this is my_gen after yield 2")

# look at the output below and match the lines to
# the code.

# only construction, no execution of the body:
generator = my_gen()
print("top level after calling my_gen()")

for element in generator:
    print("top level:", element)
top level after calling my_gen()
  >> this is my_gen, execution begins
top level: 1
  >> this is my_gen after yield 1
top level: 2
  >> this is my_gen after yield 2

Generators are special cases of iterators, so we can also pass them to list, set, ...:

print(list(my_gen()))
  >> this is my_gen, execution begins
  >> this is my_gen after yield 1
  >> this is my_gen after yield 2
[1, 2]
print(set(my_gen()))
  >> this is my_gen, execution begins
  >> this is my_gen after yield 1
  >> this is my_gen after yield 2
{1, 2}
for (i, v) in enumerate(my_gen()):
    print(i, v)
  >> this is my_gen, execution begins
0 1
  >> this is my_gen after yield 1
1 2
  >> this is my_gen after yield 2

So we can reimplement the built-in enumerate method like this:

def my_enumerator(iterator):
    i = 0
    for value in iterator:
        yield (i, value)
        i += 1
        
for i, v in my_enumerator("ab"):
    print(i, v)

print()
for i, v in my_enumerator(range(2)):
    print(i, v)

print()
for i, v in my_enumerator(["b", "c"]):
    print(i, v)   
0 a
1 b

0 0
1 1

0 b
1 c

Why generators ?

  • only compute the values "on demand" and process them as delivered without building a possibly huge list
  • example: many aggregations as min or max computations don't need the full list of values, a "stream" of values is enough
  • splitting a data processing pipeline by nesting generators can be very efficient in terms of memory.

Now the improved solution

def formulas_in_range(mz_min, mz_max):
    """simple optimizations for upper bounds"""

    assert mz_min > 0
    assert mz_min <= mz_max
    
    mass_C = 12.0
    mass_H = 1.0078250319
    mass_O = 15.994915
    
    c_max = int(mz_max / mass_C)
    
    for ci in range(c_max + 1):
   
        h_max = int(mz_max / mass_H) 
        for hi in range(h_max + 1):
        
            o_max = int(mz_max / mass_O)
            for oi in range(o_max + 1):
            
                mass = ci * mass_C + hi * mass_H + oi * mass_O
                
                if mz_min <= mass <= mz_max:
                    yield mass, ci, hi, oi
                    

for mass, ci, hi, oi in formulas_in_range(100, 100.1):
    mf = format_mf(ci, hi, oi)
    print("{:15s} {:f}".format(mf, mass))
H4O6            100.000790
CH8O5           100.037175
C2H12O4         100.073560
C4H4O3          100.016045
C5H8O2          100.052430
C6H12O          100.088815
C8H4            100.031300
# same checks as above

mass_C = 12.0
mass_H = 1.0078250319
mass_O = 15.994915


print(list(formulas_in_range(mass_H, mass_H)))
print(list(formulas_in_range(mass_C, mass_C)))
print(list(formulas_in_range(mass_O, mass_O)))
[(1.0078250319, 0, 1, 0)]
[(12.0, 1, 0, 0)]
[(15.994915, 0, 0, 1)]
def formulas_in_range_optimized(mz_min, mz_max):
    """ better optimization for upper bounds and for 
    lower bound of most inner loop"""
    
    assert mz_min > 0
    assert mz_min <= mz_max 
    
    mass_C = 12.0
    mass_H = 1.0078250319
    mass_O = 15.994915
    
    c_max = int(mz_max / mass_C)
    
    for ci in range(c_max + 1):
        
        mass_explained_c = mass_C * ci
        h_max = int((mz_max - mass_explained_c) / mass_H)
       
        for hi in range(h_max + 1):
            
            mass_explained_ch = mass_explained_c + hi * mass_H
            o_max = int((mz_max - mass_explained_ch) / mass_O)
            o_min = int((mz_min - mass_explained_ch) / mass_O)
           
            for oi in range(o_min, o_max + 1):
                
                mass = mass_explained_ch + oi * mass_O
                
                if mz_min <= mass <= mz_max:
                    yield mass, ci, hi, oi


for mass, ci, hi, oi in formulas_in_range(100, 100.1):
    mf = format_mf(ci, hi, oi)
    print("{:15s} {:f}".format(mf, mass))
H4O6            100.000790
CH8O5           100.037175
C2H12O4         100.073560
C4H4O3          100.016045
C5H8O2          100.052430
C6H12O          100.088815
C8H4            100.031300
# same checks as above

mass_C = 12.0
mass_H = 1.0078250319
mass_O = 15.994915


print(list(formulas_in_range_optimized(mass_H, mass_H)))
print(list(formulas_in_range_optimized(mass_C, mass_C)))
print(list(formulas_in_range_optimized(mass_O, mass_O)))
[(1.0078250319, 0, 1, 0)]
[(12.0, 1, 0, 0)]
[(15.994915, 0, 0, 1)]

Some benchmarks

# quick demo time module:

import time
print("seconds since 1st jan 1970 is", time.time())

started = time.time()
time.sleep(1.2)
print("sleep needed {} seconds".format(time.time() - started))
seconds since 1st jan 1970 is 1510231965.14184
sleep needed 1.200575828552246 seconds

We run the function multiple times for multiple mz bounds. For proper measurement we use median of multiple time measurements to compensate the influence of background computations on your machine.

Exercise: also compute the std deviations !

# compare methods

import time

# since Python 3.4, see https://docs.python.org/3/library/statistics.html:
import statistics


def measure_time(mf_generator, mz_min, mz_max, n_runs=7):
    """this function helps us to avoid code
    duplication for benchmarking different generators"""
    
    times = []
    for __ in range(n_runs):
        started = time.time()
        
        # we must use list here to exhaust the generator !
        result = list(mf_generator(mz_min, mz_max))
        
        needed = time.time() - started
        times.append(needed)
    
    return len(result), statistics.median(times)


for mz_min, mz_max in [(100, 100.1), (100, 101), (500, 501), (1000, 1000.1), (1000, 1001)]:

    n_naive, t_naive = measure_time(formulas_in_range, mz_min, mz_max)
    n_optimized, t_optimized = measure_time(formulas_in_range_optimized, mz_min, mz_max)
    
    # another defensive check to be sure that our optimization did not discard 
    # results:
    assert n_naive == n_optimized
    
    print("found {} formulas in range {}..{}".format(n_naive, mz_min, mz_max))
    print("naive method needs {:.2e} seconds, optimized needs {:.2e} seconds".format(t_naive, t_optimized))
    print("speed up is {:.2f}".format(t_naive / t_optimized))
    print()
found 7 formulas in range 100..100.1
naive method needs 2.21e-03 seconds, optimized needs 4.66e-04 seconds
speed up is 4.74

found 34 formulas in range 100..101
naive method needs 2.78e-03 seconds, optimized needs 4.79e-04 seconds
speed up is 5.81

found 685 formulas in range 500..501
naive method needs 1.93e-01 seconds, optimized needs 1.09e-02 seconds
speed up is 17.74

found 285 formulas in range 1000..1000.1
naive method needs 1.41e+00 seconds, optimized needs 4.22e-02 seconds
speed up is 33.40

found 2665 formulas in range 1000..1001
naive method needs 1.46e+00 seconds, optimized needs 4.30e-02 seconds
speed up is 33.91