summary refs log tree commit diff
diff options
context:
space:
mode:
author rusinthread <rusinthread@cock.li> 2017-02-14 22:09:18 +0300
committer rusinthread <rusinthread@cock.li> 2017-02-14 22:09:18 +0300
commit 720a66255c74287c9acae9095eeba3cbd4a8f6e4 (patch)
tree 31b9ac958aab82f276ee1ba421a31919a1706f75
parent 12dedd4357890fffa11d5d7b138e0a919d2ab6d7 (diff)
analyze letters freq
-rw-r--r-- util.py 94
1 files changed, 63 insertions, 31 deletions
diff --git a/util.py b/util.py
index 4add30c..fdefe51 100644
--- a/util.py
+++ b/util.py
@@ -8,6 +8,8 @@ RU_ALPHABET = 'абвгдеёжзийклмнопрстуфхцчшщъыьэю
GLAS_LETTERS = 'аеёиоуыэюя'
SOGLAS_LETTERS = 'бвгджзйклмнпрстфхцчшщъь'
+ENG_ALPHABET = 'abcdefghijklmnopqrstuvwxyz'
+
RU_FREQ = {
'о': 0.10983,
'е': 0.08483,
@@ -132,7 +134,7 @@ def split_sen(s):
return result_lines
-def analyze_sentences(lines):
+def analyze_sentences(lines, not_used=False):
max_len = 0
for line in lines:
if len(line) > max_len:
@@ -168,47 +170,77 @@ def analyze_sentences(lines):
cprint('Total unique characters: %d\n' % unique_letters_amount(''.join(lines)), 'white', attrs=['bold'], end='')
+ if not_used:
+ not_used_list = []
+ s = ''.join(lines).lower()
+ for c in RU_ALPHABET:
+ if c not in s and c not in not_used_list:
+ not_used_list.append(c)
+ cprint('Not used letters: %s' % ', '.join(not_used_list), 'white', attrs=['bold'])
-def analyze_letters_frequency(s):
- letters = []
- s = s.lower()
- s = s.replace(' ', '').replace('.', '').replace('?', '').replace(',', '').replace('!', '')
- s = s.replace('ё', 'е').replace('ъ', 'ь')
- s = s.replace("'", '')
- added = []
-
- for l in s:
- if l not in added:
- freq = s.count(l) / len(s)
- letters.append(( l, freq ))
- added.append(l)
+class LetterFreq:
+ def __init__(self, letter, freq):
+ self.letter = letter
+ self.freq = freq
+
+ def __repr__(self):
+ return '<LetterFreq of %s = %f>' % (self.letter, self.freq)
+
+def analyze_letters_frequency_type4(s_in, eng=False, source_eng=False, only_unique=False, print_result=True):
+ freqs = []
+ added_letters = []
+ repl = {}
+
+ alphabet = RU_ALPHABET if not source_eng else ENG_ALPHABET
+ alphabet_freq = RU_FREQ if not eng else ENG_FREQ
+
+ s = ''
+ for c in s_in.lower():
+ if c == "'" or c in alphabet:
+ if c not in added_letters:
+ added_letters.append(c)
+ freqs.append(LetterFreq(c, 0))
+ s += c
- letters = sorted(letters, key=operator.itemgetter(1), reverse=True)
+ for lf in freqs:
+ lf.freq = s.count(lf.letter) / len(s)
- repl_table = {}
- in_table = []
+ # sort by frequency
+ freqs = sorted(freqs, key=operator.attrgetter('freq'), reverse=True)
- for l in letters:
- l, freq = l
-
+ already_found_letters = []
+
+ for lf in freqs:
closest_delta = 1
closest_letter = '?'
- closest_tbl_freq = 0
+ closest_alphabet_freq = 0
- for tbl_l, tbl_freq in RU_FREQ.items():
- delta = abs(tbl_freq - freq)
- if delta < closest_delta and tbl_l not in in_table:
+ for a_letter, a_freq in alphabet_freq.items():
+ delta = abs(a_freq - lf.freq)
+ if delta < closest_delta and (a_letter not in already_found_letters or not only_unique):
closest_delta = delta
- closest_letter = tbl_l
- closest_tbl_freq = tbl_freq
-
+ closest_letter = a_letter
+ closest_alphabet_freq = a_freq
+
+ repl[lf.letter] = closest_letter
+ already_found_letters.append(closest_letter)
+
+ if print_result:
+ print('%s (%f) ---> %s (%f)' % (lf.letter, lf.freq, closest_letter, closest_alphabet_freq))
- repl_table[l] = closest_letter
- in_table.append(closest_letter)
- print('%s (%f) ---> %s (%f)' % (l, freq, closest_letter, closest_tbl_freq))
+ return repl
- return repl_table
+# s - lowercase string
+# repl - { from: to } dict
+def replace_by_dict(s, repl):
+ buf = ''
+ for c in s:
+ if c in repl:
+ buf += repl[c]
+ else:
+ buf += c
+ return buf
def print_hl(s, tohl, end="\n"):
i = 0