diff options
author | rusinthread <rusinthread@cock.li> | 2017-02-14 22:09:18 +0300 |
---|---|---|
committer | rusinthread <rusinthread@cock.li> | 2017-02-14 22:09:18 +0300 |
commit | 720a66255c74287c9acae9095eeba3cbd4a8f6e4 (patch) | |
tree | 31b9ac958aab82f276ee1ba421a31919a1706f75 | |
parent | 12dedd4357890fffa11d5d7b138e0a919d2ab6d7 (diff) |
analyze letters freq
-rw-r--r-- | util.py | 94 |
1 files changed, 63 insertions, 31 deletions
@@ -8,6 +8,8 @@ RU_ALPHABET = 'абвгдеёжзийклмнопрстуфхцчшщъыьэю GLAS_LETTERS = 'аеёиоуыэюя' SOGLAS_LETTERS = 'бвгджзйклмнпрстфхцчшщъь' +ENG_ALPHABET = 'abcdefghijklmnopqrstuvwxyz' + RU_FREQ = { 'о': 0.10983, 'е': 0.08483, @@ -132,7 +134,7 @@ def split_sen(s): return result_lines -def analyze_sentences(lines): +def analyze_sentences(lines, not_used=False): max_len = 0 for line in lines: if len(line) > max_len: @@ -168,47 +170,77 @@ def analyze_sentences(lines): cprint('Total unique characters: %d\n' % unique_letters_amount(''.join(lines)), 'white', attrs=['bold'], end='') + if not_used: + not_used_list = [] + s = ''.join(lines).lower() + for c in RU_ALPHABET: + if c not in s and c not in not_used_list: + not_used_list.append(c) + cprint('Not used letters: %s' % ', '.join(not_used_list), 'white', attrs=['bold']) -def analyze_letters_frequency(s): - letters = [] - s = s.lower() - s = s.replace(' ', '').replace('.', '').replace('?', '').replace(',', '').replace('!', '') - s = s.replace('ё', 'е').replace('ъ', 'ь') - s = s.replace("'", '') - added = [] - - for l in s: - if l not in added: - freq = s.count(l) / len(s) - letters.append(( l, freq )) - added.append(l) +class LetterFreq: + def __init__(self, letter, freq): + self.letter = letter + self.freq = freq + + def __repr__(self): + return '<LetterFreq of %s = %f>' % (self.letter, self.freq) + +def analyze_letters_frequency_type4(s_in, eng=False, source_eng=False, only_unique=False, print_result=True): + freqs = [] + added_letters = [] + repl = {} + + alphabet = RU_ALPHABET if not source_eng else ENG_ALPHABET + alphabet_freq = RU_FREQ if not eng else ENG_FREQ + + s = '' + for c in s_in.lower(): + if c == "'" or c in alphabet: + if c not in added_letters: + added_letters.append(c) + freqs.append(LetterFreq(c, 0)) + s += c - letters = sorted(letters, key=operator.itemgetter(1), reverse=True) + for lf in freqs: + lf.freq = s.count(lf.letter) / len(s) - repl_table = {} - in_table = [] + # sort by frequency + freqs = sorted(freqs, key=operator.attrgetter('freq'), reverse=True) - for l in letters: - l, freq = l - + already_found_letters = [] + + for lf in freqs: closest_delta = 1 closest_letter = '?' - closest_tbl_freq = 0 + closest_alphabet_freq = 0 - for tbl_l, tbl_freq in RU_FREQ.items(): - delta = abs(tbl_freq - freq) - if delta < closest_delta and tbl_l not in in_table: + for a_letter, a_freq in alphabet_freq.items(): + delta = abs(a_freq - lf.freq) + if delta < closest_delta and (a_letter not in already_found_letters or not only_unique): closest_delta = delta - closest_letter = tbl_l - closest_tbl_freq = tbl_freq - + closest_letter = a_letter + closest_alphabet_freq = a_freq + + repl[lf.letter] = closest_letter + already_found_letters.append(closest_letter) + + if print_result: + print('%s (%f) ---> %s (%f)' % (lf.letter, lf.freq, closest_letter, closest_alphabet_freq)) - repl_table[l] = closest_letter - in_table.append(closest_letter) - print('%s (%f) ---> %s (%f)' % (l, freq, closest_letter, closest_tbl_freq)) + return repl - return repl_table +# s - lowercase string +# repl - { from: to } dict +def replace_by_dict(s, repl): + buf = '' + for c in s: + if c in repl: + buf += repl[c] + else: + buf += c + return buf def print_hl(s, tohl, end="\n"): i = 0 |