"""Helpers for analysing and brute-forcing simple text ciphers.

Contains sentence splitting, per-sentence statistics, letter-frequency
matching against a Russian frequency table, ROT-N enumeration for the English
and Russian alphabets, and an acrostic-style word search across sentences.
"""

import itertools
import operator
import re
import sys
from collections import Counter
from pprint import pprint

try:
    from termcolor import cprint
except ImportError:
    # Colour is purely cosmetic: fall back to plain printing so the module
    # stays usable when termcolor is not installed.
    def cprint(text, color=None, on_color=None, attrs=None, *,
               no_color=None, force_color=None, **kwargs):
        """Print *text* without colouring (termcolor-compatible signature)."""
        print(text, **kwargs)


RU_ALPHABET = 'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
GLAS_LETTERS = 'аеёиоуыэюя'
SOGLAS_LETTERS = 'бвгджзйклмнпрстфхцчшщъь'

# Relative letter frequencies (fractions of 1).  The insertion order matters:
# analyze_letters_frequency() breaks ties in favour of the earlier entry.
RU_FREQ = {
    'о': 0.10983,
    'е': 0.08483,
    'а': 0.07998,
    'и': 0.07367,
    'н': 0.067,
    'т': 0.06318,
    'с': 0.05473,
    'р': 0.04746,
    'в': 0.04533,
    'л': 0.04343,
    'к': 0.03486,
    'м': 0.03203,
    'д': 0.02977,
    'п': 0.02804,
    'у': 0.02615,
    'я': 0.02001,
    'ы': 0.01898,
    'ь': 0.01735,
    'г': 0.01687,
    'з': 0.01641,
    'б': 0.01592,
    'ч': 0.0145,
    'й': 0.01208,
    'х': 0.00966,
    'ж': 0.0094,
    'ш': 0.00718,
    'ю': 0.00639,
    'ц': 0.00486,
    'щ': 0.00361,
    'э': 0.00331,
    'ф': 0.00267,
    'ъ': 0.00037,
    'ё': 0.00013,
}

# Written as percentages and normalised to fractions of 1 below.
# NOTE(review): there is no entry for 'Q' in this table - confirm whether
# that omission is intentional before relying on full-alphabet coverage.
ENG_FREQ = {
    letter: percent / 100
    for letter, percent in {
        'A': 8.1,
        'B': 1.4,
        'C': 2.7,
        'D': 3.9,
        'E': 13.0,
        'F': 2.9,
        'G': 2.0,
        'H': 5.2,
        'I': 6.5,
        'J': 0.2,
        'K': 0.4,
        'L': 3.4,
        'M': 2.5,
        'N': 7.2,
        'O': 7.9,
        'P': 2.0,
        'R': 6.9,
        'S': 6.1,
        'T': 10.5,
        'U': 2.4,
        'V': 0.9,
        'W': 1.5,
        'X': 0.2,
        'Y': 1.9,
        'Z': 0.1,
    }.items()
}

BF_NAMES = [
    'марширующе',
    'марширующий',
    'свободин',
    'мовсаев',
    'щиголев',
    'щиголёв',
]


def plural(n, words):
    """Pick the singular or plural form from a space-separated pair.

    *words* looks like ``'word words'``; the first form is returned when
    ``n == 1`` and the second one otherwise.
    """
    forms = words.split(' ')
    return forms[0 if n == 1 else 1]


def split_sen(s):
    """Split text into sentences.

    A sentence ends at the first ``'. '``, ``'? '`` or ``'! '``; the
    punctuation mark stays with the sentence and the following space is
    dropped.  Afterwards every whitespace-separated token that is purely
    numeric (ignoring ``.``, ``!`` and ``?``) is moved onto its own line.

    Returns the resulting list of lines.
    """
    s = s.strip()
    endings = ('. ', '? ', '! ')

    sentences = []
    pos = 0
    while pos < len(s):
        hits = [s.find(ending, pos) for ending in endings]
        hits = [hit for hit in hits if hit != -1]
        cut = min(hits) if hits else None
        # Compare against None explicitly: index 0 is a valid position and
        # must not be mistaken for "no sentence end found".
        if cut is not None:
            sentences.append(s[pos:cut + 1])
            pos = cut + 2
        else:
            sentences.append(s[pos:])
            pos = len(s)

    result_lines = []
    for sentence in sentences:
        pending = []
        for token in re.split(r'\s+', sentence):
            if re.sub(r'[\.\!\?]', '', token).isdigit():
                # A number interrupts the sentence: flush what was collected
                # so far and emit the number as a separate line.
                if pending:
                    result_lines.append(' '.join(pending))
                    pending = []
                result_lines.append(token)
            else:
                pending.append(token)
        if pending:
            result_lines.append(' '.join(pending))
    return result_lines


def analyze_sentences(lines):
    """Print a numbered, column-aligned table of per-sentence statistics.

    For each line: word count, length with and without spaces, and the number
    of distinct Russian letters.  A total of distinct Russian letters across
    all lines is printed last.
    """
    # One extra column so that even the longest line is followed by a space.
    column_width = max((len(line) for line in lines), default=0) + 1

    for number, line in enumerate(lines, start=1):
        words = re.split(r'\s+', line)
        cprint('%2d. ' % number, 'cyan', end='')
        print(line, end='')
        if len(line) < column_width:
            print(' ' * (column_width - len(line)), end='')

        cprint(str(len(words)), 'green', attrs=['bold'], end='')
        cprint(' %s,' % plural(len(words), 'word words'), 'green', end='')

        cprint(' %d' % len(line), 'yellow', attrs=['bold'], end='')
        cprint('/', 'yellow', end='')
        cprint('%d' % len(line.replace(' ', '')), 'yellow', attrs=['bold'], end='')
        cprint(' chars ', 'yellow', end='')

        unique = unique_letters_amount(line)
        cprint('(', 'red', end='')
        cprint(unique, 'red', attrs=['bold'], end='')
        cprint(' unique)', 'red')

    cprint('Total unique characters: %d\n' % unique_letters_amount(''.join(lines)),
           'white', attrs=['bold'], end='')


def analyze_letters_frequency(s):
    """Guess a substitution table by matching letter frequencies to RU_FREQ.

    The text is lower-cased, spaces and ``. ? , ! '`` are removed, and
    ``ё``/``ъ`` are folded into ``е``/``ь``.  Letters are then processed from
    most to least frequent; each one is paired with the not-yet-used RU_FREQ
    letter whose frequency is closest.  Every pairing is printed.

    Returns a dict mapping each letter of the text to its guessed letter
    (``'?'`` once RU_FREQ has been exhausted).
    """
    # Single pass: fold ё->е and ъ->ь, drop spaces and punctuation.
    s = s.lower().translate(str.maketrans('ёъ', 'еь', " .?,!'"))

    # Counter keeps first-occurrence order and sorted() is stable, so letters
    # with equal frequency stay in the order they first appear in the text.
    total = len(s)
    letters = sorted(
        ((letter, count / total) for letter, count in Counter(s).items()),
        key=operator.itemgetter(1),
        reverse=True,
    )

    repl_table = {}
    used = set()
    for letter, freq in letters:
        closest_delta = 1
        closest_letter = '?'
        closest_tbl_freq = 0
        for tbl_letter, tbl_freq in RU_FREQ.items():
            delta = abs(tbl_freq - freq)
            if delta < closest_delta and tbl_letter not in used:
                closest_delta = delta
                closest_letter = tbl_letter
                closest_tbl_freq = tbl_freq
        repl_table[letter] = closest_letter
        used.add(closest_letter)
        print('%s (%f) ---> %s (%f)' % (letter, freq, closest_letter, closest_tbl_freq))
    return repl_table


def print_hl(s, tohl, end="\n"):
    """Print *s*, highlighting every case-insensitive occurrence of *tohl*.

    *tohl* is compared against single characters of *s*.  *end* is printed
    after the string.  Returns the list of 0-based highlighted positions.
    """
    target = tohl.lower()
    indexes = []
    for position, char in enumerate(s):
        if char.lower() == target:
            indexes.append(position)
            cprint(char, 'red', attrs=['bold', 'underline'], end='')
        else:
            print(char, end='')
    print('', end=end)
    return indexes


def _print_rotations(s, alphabet):
    """Print *s* upper-cased under every cyclic shift of *alphabet*.

    One line per shift, starting with shift 0.  Characters that are not in
    *alphabet* are printed unchanged.
    """
    size = len(alphabet)
    position = {letter: index for index, letter in enumerate(alphabet)}
    text = s.upper()
    for shift in range(size):
        print(''.join(
            alphabet[(position[char] + shift) % size] if char in position else char
            for char in text
        ))


def rot_en(s):
    """Print all 26 ROT-N variants of *s* using the A-Z alphabet."""
    _print_rotations(s, ''.join(map(chr, range(65, 91))))


def rot_ru(s):
    """Print all 33 ROT-N variants of *s* using the Russian alphabet."""
    _print_rotations(s, RU_ALPHABET.upper())


def bf_all(table):
    """Return every combination that takes one item from each entry of *table*."""
    return list(itertools.product(*table))


def spaceitout(string, amount):
    """Insert *amount* spaces between the characters of *string*.

    *amount* may be anything accepted by ``int()``.  Leading and trailing
    whitespace of the result is stripped.
    """
    # The converted value was previously computed but never used, so a string
    # amount such as "2" raised TypeError.
    gap = ' ' * int(amount)
    return gap.join(string).strip()


def unique_letters_amount(line):
    """Count how many distinct Russian letters occur in *line* (case-insensitive)."""
    upper_line = line.upper()
    return sum(1 for letter in RU_ALPHABET.upper() if letter in upper_line)


def bf_find_words(lines, words, nospaces=False):
    """Search for words spelled one letter per consecutive line and print them.

    A word matches at line ``k`` when its first letter occurs somewhere in
    line ``k``, its second letter in line ``k + 1``, and so on
    (case-insensitive).  For every match the participating lines are printed
    with the letter highlighted, followed by the 1-based positions of that
    letter within the line.

    With ``nospaces=True`` all whitespace is removed from the lines first, so
    the reported positions ignore whitespace.  Empty words are skipped.
    """
    words = [w.lower() for w in words if w]
    if not words:
        return
    min_word_len = min(map(len, words))

    if nospaces:
        lines = [re.sub(r'\s+', '', line) for line in lines]

    for line_start in range(len(lines) - min_word_len + 1):
        cur_lines = lines[line_start:]
        for w in words:
            if len(w) > len(cur_lines):
                continue
            window = cur_lines[:len(w)]
            if not all(char in line.lower() for char, line in zip(w, window)):
                continue
            max_sen_len = max(len(line.lower()) for line in window)

            # Header: "word <w> found in <first>-<last>:"
            print('word ', end='')
            cprint(w, 'white', attrs=['bold', 'underline'], end='')
            print(' found in ', end='')
            cprint(line_start + 1, 'white', attrs=['bold'], end='')
            print('-', end='')
            cprint(line_start + len(w), 'white', attrs=['bold'], end='')
            print(':')

            for offset, (char, line) in enumerate(zip(w, window)):
                print(' ', end='')
                cprint('%2d. ' % (line_start + offset + 1), 'cyan', end='')
                indexes = print_hl(line, char, end='')
                if len(line) < max_sen_len:
                    print(' ' * (max_sen_len - len(line)), end='')
                cprint(' %s. ' % char, 'cyan', end='')

                # Report 1-based positions, comma separated.
                positions = [str(index + 1) for index in indexes]
                for k, position in enumerate(positions):
                    cprint(position, 'green', end='')
                    if k < len(positions) - 1:
                        print(', ', end='')
                print()
            print()