import requests
# Download the cipher text. The timeout keeps the script from hanging forever
# on a dead connection, and raise_for_status() stops an HTTP error page from
# being analysed as if it were the cipher text.
response = requests.get(
    "https://siscourses.ethz.ch/python_dbiol/data/encoded.txt",
    timeout=30,
)
response.raise_for_status()
encoded = response.text
print(encoded[:100])

# Target letters for the substitution: compute_mapping pairs this string
# position by position with the cipher symbols ranked by frequency
# (presumably English letters from most to least frequent).
DEFAULT_ORDER = "etaonhisrldwugfycbmkpvjqxz"
# Three alternative ways to build a character histogram:
def symbol_histogram(text):
    """Count how often each symbol occurs in ``text`` using a plain dict.

    Returns a dict mapping symbol -> number of occurrences; keys appear in
    the order in which the symbols are first seen.
    """
    counts = {}
    for char in text:
        # get() supplies 0 for a symbol that has not been seen yet.
        counts[char] = counts.get(char, 0) + 1
    return counts
from collections import defaultdict
def symbol_histogram_2(text):
    """Count how often each symbol occurs in ``text`` using a defaultdict.

    Returns a ``defaultdict(int)`` mapping symbol -> number of occurrences.
    """
    counts = defaultdict(int)
    for char in text:
        # A missing key is created with value 0 on first access.
        counts[char] = counts[char] + 1
    return counts
from collections import Counter
def symbol_histogram_3(text):
    """Count how often each symbol occurs in ``text`` using a Counter.

    Returns a ``collections.Counter`` mapping symbol -> number of occurrences.
    """
    tally = Counter()
    tally.update(text)
    return tally
# Sanity check: the Counter-based histogram must agree with the dict-based
# one on the downloaded text.
assert symbol_histogram_3(encoded) == symbol_histogram(encoded)
import pprint
# Pretty-print the symbol -> count mapping of the encoded text for inspection.
pprint.pprint(symbol_histogram(encoded))
def rank_characters(text):
    """Return the distinct symbols of ``text`` as one string, most frequent first.

    Symbols with the same count are ordered by descending symbol value,
    because the sort key is the pair (count, symbol) and the sort is reversed.
    """
    frequencies = symbol_histogram(text)
    # Order by count first and by the symbol itself second, both descending.
    ordered = sorted(
        frequencies.items(),
        key=lambda item: (item[1], item[0]),
        reverse=True,
    )
    # Keep only the symbols and glue them together.
    return "".join(symbol for symbol, _count in ordered)
# Remove newlines and spaces so that only the remaining symbols are ranked.
encoded_characters = encoded.replace("\n", "").replace(" ", "")
# Show the symbols of the encoded text, most frequent first.
print(rank_characters(encoded_characters))
def compute_mapping(characters, english_order=DEFAULT_ORDER):
    """Build a substitution table from cipher symbols to target letters.

    The symbols of ``characters`` are ranked by frequency and paired position
    by position with ``english_order``: the most frequent symbol maps to the
    first letter, the second most frequent to the second letter, and so on.
    Pairing stops at the end of the shorter of the two sequences.

    Returns a dict mapping cipher symbol -> letter.
    """
    cipher_ranking = rank_characters(characters)
    return {
        cipher_symbol: plain_letter
        for cipher_symbol, plain_letter in zip(cipher_ranking, english_order)
    }
# Show the symbol -> letter mapping obtained with the default letter order.
print(compute_mapping(encoded_characters))
def decypher(encoded, english_order=DEFAULT_ORDER):
    """Decode ``encoded`` with a frequency-based substitution table.

    Spaces and newlines are left out when the table is computed. Every symbol
    of ``encoded`` is then replaced by its mapped letter; a symbol without an
    entry in the table (including spaces and newlines) becomes a single space.

    Returns the decoded text as one string.
    """
    symbols_only = encoded.replace(" ", "").replace("\n", "")
    substitution = compute_mapping(symbols_only, english_order)
    return "".join(substitution.get(symbol, " ") for symbol in encoded)
# First attempt: decode with the default letter order and inspect the start.
decyphered = decypher(encoded)
print(decyphered[:100])
# Look at the output, then swap n and h in the letter order
# ("hole" shows up as "nole", "sitting" as "sittihg"):
decyphered = decypher(encoded, "etaohnisrldwugfycbmkpvjqxz")
print(decyphered[:100])
# Look at the output, then swap f and y in the letter order
# ("very" shows up as "verf", "of" as "oy"):
decyphered = decypher(encoded, "etaohnisrldwugyfcbmkpvjqxz")
print(decyphered[:100])
# Look at the output, then swap p and k in the letter order
# ("chapter" shows up as "chakter"):
decyphered = decypher(encoded, "etaohnisrldwugyfcbmpkvjqxz")
# Print the complete decoded text with the final letter order.
print(decyphered)