diff options
-rw-r--r-- | README.md | 4 | ||||
-rw-r--r-- | data_lib.py | 8 | ||||
-rw-r--r-- | util.py | 334 |
3 files changed, 344 insertions, 2 deletions
@@ -49,9 +49,11 @@ Скрипт `analyze_new.py` - черновик частотного анализа положения букв для расшифровки шифртекстов 3-го типа, тексты которых лежат в папке `new` (см. [data.md](./data.md) о типах). +В `util.py` полезные функции для анализа с красочным выводом. + ## Требования -Работает на Python 3. Рекомендуется установить модуль `termcolor` чтобы вывод был цветным. +Работает на Python 3. Рекомендуется установить модуль `termcolor` чтобы вывод был цветным (а util.py вообще не умеет без него работать). Для `search_2ch_archive.py` нужен модуль `requests`. diff --git a/data_lib.py b/data_lib.py index 5bf468e..fea488f 100644 --- a/data_lib.py +++ b/data_lib.py @@ -13,7 +13,7 @@ def _data_sort_date(i): return int(time.mktime(datetime.datetime.strptime(i['date'], '%d/%m/%y').timetuple())) # sort: 'len', 'date' -def load_data(sort='len', sort_reverse=False): +def load_data(sort='len', sort_reverse=False, date=None, type=None): with open(os.path.join(CWD, "data.json")) as f: data = json.loads(f.read()) @@ -28,8 +28,14 @@ def load_data(sort='len', sort_reverse=False): else: raise Error("Unknown sort type " + str(sort)) + # filter by date data = sorted(data, key=sort_f, reverse=sort_reverse) + if date: + data = list(filter(lambda i: 'date' in i and i['date'] == date, data)) + + if type: + data = list(filter(lambda i: 'type' in i and i['type'] == type, data)) return data @@ -0,0 +1,334 @@ +import re, sys +from pprint import pprint +import operator +from termcolor import cprint +import itertools + +RU_ALPHABET = 'абвгдеёжзийклмнопрстуфхцчшщъыьэюя' +GLAS_LETTERS = 'аеёиоуыэюя' +SOGLAS_LETTERS = 'бвгджзйклмнпрстфхцчшщъь' + +RU_FREQ = { + 'о': 0.10983, + 'е': 0.08483, + 'а': 0.07998, + 'и': 0.07367, + 'н': 0.067, + 'т': 0.06318, + 'с': 0.05473, + 'р': 0.04746, + 'в': 0.04533, + 'л': 0.04343, + 'к': 0.03486, + 'м': 0.03203, + 'д': 0.02977, + 'п': 0.02804, + 'у': 0.02615, + 'я': 0.02001, + 'ы': 0.01898, + 'ь': 0.01735, + 'г': 0.01687, + 'з': 0.01641, + 'б': 0.01592, + 'ч': 0.0145, + 'й': 0.01208, + 'х': 0.00966, + 'ж': 0.0094, + 'ш': 0.00718, + 'ю': 0.00639, + 'ц': 0.00486, + 'щ': 0.00361, + 'э': 0.00331, + 'ф': 0.00267, + 'ъ': 0.00037, + 'ё': 0.00013 +} + +ENG_FREQ = { + 'A': 8.1, + 'B': 1.4, + 'C': 2.7, + 'D': 3.9, + 'E': 13.0, + 'F': 2.9, + 'G': 2.0, + 'H': 5.2, + 'I': 6.5, + 'J': 0.2, + 'K': 0.4, + 'L': 3.4, + 'M': 2.5, + 'N': 7.2, + 'O': 7.9, + 'P': 2.0, + 'R': 6.9, + 'S': 6.1, + 'T': 10.5, + 'U': 2.4, + 'V': 0.9, + 'W': 1.5, + 'X': 0.2, + 'Y': 1.9, + 'Z': 0.1, +} +for k, v in ENG_FREQ.items(): + ENG_FREQ[k] = v/100 + + +BF_NAMES = ( + 'марширующе', + 'марширующий', + 'свободин', + 'мовсаев', + 'щиголев', + 'щиголёв', +) + +def plural(n, words): + words = words.split(' ') + return words[0 if n == 1 else 1] + +# split text to sentences +def split_sen(s): + s = s.strip() + lines = [] + + endings = ('. ', '? ', '! ') + + pos = 0 + while pos < len(s): + min_index = None + for end in endings: + try: + i = s.index(end, pos) + except ValueError: + continue + if min_index == None or i < min_index: + min_index = i + + if min_index: + line = s[pos:min_index+1] + pos = min_index+2 + else: + line = s[pos:] + pos = len(s) + + lines.append(line) + + result_lines = [] + for line in lines: + words = re.split(r'\s+', line) + buf = [] + for w in words: + if not re.sub(r'[\.\!\?]', '', w).isdigit(): + buf.append(w) + else: + if len(buf): + result_lines.append(' '.join(buf)) + result_lines.append(w) + buf = [] + if len(buf): + result_lines.append(' '.join(buf)) + + return result_lines + +def analyze_sentences(lines): + max_len = 0 + for line in lines: + if len(line) > max_len: + max_len = len(line) + + max_len += 1 + + i = 1 + for line in lines: + words = re.split(r'\s+', line) + + cprint('%2d. ' % i, 'cyan', end='') + + print(line, end='') + if len(line) < max_len: + print(' ' * (max_len-len(line)), end='') + + cprint(str(len(words)), 'green', attrs=['bold'], end='') + cprint(' %s,' % plural(len(words), 'word words'), 'green', end='') + + cprint(' %d' % len(line), 'yellow', attrs=['bold'], end='') + cprint('/', 'yellow', end='') + cprint('%d' % len(line.replace(' ', '')), 'yellow', attrs=['bold'], end='') + + cprint(' chars ', 'yellow', end='') + + unique = unique_letters_amount(line) + cprint('(', 'red', end='') + cprint(unique, 'red', attrs=['bold'], end='') + cprint(' unique)', 'red') + + i += 1 + + cprint('Total unique characters: %d\n' % unique_letters_amount(''.join(lines)), 'white', attrs=['bold'], end='') + + +def analyze_letters_frequency(s): + letters = [] + s = s.lower() + s = s.replace(' ', '').replace('.', '').replace('?', '').replace(',', '').replace('!', '') + s = s.replace('ё', 'е').replace('ъ', 'ь') + s = s.replace("'", '') + + added = [] + + for l in s: + if l not in added: + freq = s.count(l) / len(s) + letters.append(( l, freq )) + added.append(l) + + letters = sorted(letters, key=operator.itemgetter(1), reverse=True) + + repl_table = {} + in_table = [] + + for l in letters: + l, freq = l + + closest_delta = 1 + closest_letter = '?' + closest_tbl_freq = 0 + + for tbl_l, tbl_freq in RU_FREQ.items(): + delta = abs(tbl_freq - freq) + if delta < closest_delta and tbl_l not in in_table: + closest_delta = delta + closest_letter = tbl_l + closest_tbl_freq = tbl_freq + + + repl_table[l] = closest_letter + in_table.append(closest_letter) + print('%s (%f) ---> %s (%f)' % (l, freq, closest_letter, closest_tbl_freq)) + + return repl_table + +def print_hl(s, tohl, end="\n"): + i = 0 + indexes = [] + for c in s: + if c != tohl: + print(c, end='') + else: + indexes.append(i) + cprint(c, 'red', attrs=['bold', 'underline'], end='') + i += 1 + print('', end=end) + return indexes + +def rot_en(s): + alphabet = '' + for i in range(65, 91): + alphabet += chr(i) + + s = s.upper() + for i in range(0, 26): + for letter in s: + if re.match('[\d\.,\_\- ]', letter): + print(letter, end='') + continue + + letter_index = alphabet.index(letter) + new_index = (letter_index + i) % 26 + new_letter = alphabet[new_index] + print(new_letter, end='') + + print() + +def rot_ru(s): + alphabet = RU_ALPHABET.upper() + + s = s.upper() + for i in range(0, 33): + for letter in s: + if re.match('[\d\.,\_\- ]', letter): + print(letter, end='') + continue + + letter_index = alphabet.index(letter) + new_index = (letter_index + i) % 33 + new_letter = alphabet[new_index] + print(new_letter, end='') + + print() + +def bf_all(table): + return list(itertools.product(*table)) + +def spaceitout(string,amount): + amountint = int(amount) + pile = "" + for char in string: + pile = pile + char + " "*amount + return pile.strip() + +def unique_letters_amount(line): + line = line.upper() + unique = 0 + for a in RU_ALPHABET.upper(): + if a in line: + unique += 1 + return unique + +def bf_find_words(lines, words, nospaces=False): + min_word_len = 100 + for w in words: + if len(w) < min_word_len: + min_word_len = len(w) + + if nospaces: + lines = list(map(lambda s: re.sub(r'\s+', '', s), lines)) + + for line_start in range(0, len(lines) - min_word_len + 1): + cur_lines = lines[line_start:] + for w in words: + w = w.lower() + if len(w) > len(cur_lines): + continue + + found = True + max_sen_len = 0 + for i in range(0, len(w)): + line = cur_lines[i].lower() + if len(line) > max_sen_len: + max_sen_len = len(line) + if w[i] not in line: + found = False + break + + if found: + # word <word> found in <n>-<m> + print('word ', end='') + cprint(w, 'white', attrs=['bold', 'underline'], end='') + print(' found in ', end='') + cprint(line_start+1, 'white', attrs=['bold'], end='') + print('-', end='') + cprint(line_start+len(w), 'white', attrs=['bold'], end='') + print(':') + + for i in range(0, len(w)): + line = cur_lines[i] + print(' ', end='') + cprint('%2d. ' % (line_start+i+1), 'cyan', end='') + indexes = print_hl(line, w[i], end='') + if len(line) < max_sen_len: + print(' ' * (max_sen_len - len(line)), end='') + + cprint(' %s. ' % (w[i]), 'cyan', end='') + + indexes = tuple(map(lambda n: str(n+1), indexes)) + for i, n in enumerate(indexes): + cprint(n, 'green', end='') + if i < len(indexes) - 1: + print(', ', end='') + + print() + + print() |