"""Toolbox for simple classical-cipher analysis of Russian/English text:

sentence splitting, letter-frequency substitution guessing, ROT-N
brute force, and dictionary-based acrostic search helpers.
"""
import re
import sys
import itertools
import operator
from pprint import pprint

# termcolor is optional: fall back to an uncolored print so the module
# still imports (and works, minus colors) when the package is missing.
try:
    from termcolor import cprint
except ImportError:
    def cprint(text, color=None, on_color=None, attrs=None, **kwargs):
        print(text, **kwargs)

RU_ALPHABET = 'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
GLAS_LETTERS = 'аеёиоуыэюя'                  # Russian vowels
SOGLAS_LETTERS = 'бвгджзйклмнпрстфхцчшщъь'   # Russian consonants + signs
ENG_ALPHABET = 'abcdefghijklmnopqrstuvwxyz'

# Relative letter frequencies of Russian text (fractions of 1).
RU_FREQ = {
    'о': 0.10983, 'е': 0.08483, 'а': 0.07998, 'и': 0.07367, 'н': 0.067,
    'т': 0.06318, 'с': 0.05473, 'р': 0.04746, 'в': 0.04533, 'л': 0.04343,
    'к': 0.03486, 'м': 0.03203, 'д': 0.02977, 'п': 0.02804, 'у': 0.02615,
    'я': 0.02001, 'ы': 0.01898, 'ь': 0.01735, 'г': 0.01687, 'з': 0.01641,
    'б': 0.01592, 'ч': 0.0145, 'й': 0.01208, 'х': 0.00966, 'ж': 0.0094,
    'ш': 0.00718, 'ю': 0.00639, 'ц': 0.00486, 'щ': 0.00361, 'э': 0.00331,
    'ф': 0.00267, 'ъ': 0.00037, 'ё': 0.00013,
}

# English letter frequencies, given in percent and normalized below.
# NOTE(review): 'Q' is absent from this table — confirm intentional.
ENG_FREQ = {
    'A': 8.1, 'B': 1.4, 'C': 2.7, 'D': 3.9, 'E': 13.0, 'F': 2.9,
    'G': 2.0, 'H': 5.2, 'I': 6.5, 'J': 0.2, 'K': 0.4, 'L': 3.4,
    'M': 2.5, 'N': 7.2, 'O': 7.9, 'P': 2.0, 'R': 6.9, 'S': 6.1,
    'T': 10.5, 'U': 2.4, 'V': 0.9, 'W': 1.5, 'X': 0.2, 'Y': 1.9,
    'Z': 0.1,
}
# Convert percentages to fractions so both tables use the same scale.
ENG_FREQ = {k: v / 100 for k, v in ENG_FREQ.items()}

BF_NAMES = [
    'марширующе',
    'марширующий',
    'свободин',
    'мовсаев',
    'щиголев',
    'щиголёв',
]

CTHULHU_DICT = [
    "ah",          # generic action, e.g. greet, eat, do
    "'ai",         # speak / call
    "athg",        # sign (contract) / agree to
    "'bthnk",      # body / essence
    "bug",         # go
    # "c-" (prefix) we / our
    "ch'",         # cross over / travel
    "chtenff",     # brotherhood / society
    "ebumna",      # pit
    "ee",          # answers
    "ehye",        # cohesion / integrity
    "ep",          # after; with
    "hai",         # later / then
    # "f'-" (prefix) they / their
    "'fhalma",     # mother
    "fhtagn",      # wait / sleep
    "fm'latgh",    # burn
    "ftaghu",      # skin / boundary
    "geb",         # here
    "gnaiih",      # father
    "gof'nn",      # children
    "goka",        # grant
    "gotha",       # wish
    "grah'n",      # lost one / larva
    # "h'-" (prefix) it / its
    "hafh'drn",    # priest / summoner
    "hai",         # now
    "hlirgh",      # heretic
    "hrii",        # followers
    "hupadgh",     # born of
    "ilyaa",       # expect / await
    "k'yarnak",    # share / exchange
    "kadishtu",    # understand / know
    "kn'a",        # question
    "li'hee",      # on pain of
    "llll",        # at / beside
    "lloig",       # mind / psyche
    "lw'nafh",     # dream / transmit
    "mg",          # (conjunction) yet
    "mnahn'",      # worthless
    "n'gha",       # death
    "n'ghft",      # darkness
    # "na-" (prefix) (contraction of nafl-)
    # "nafl-" (prefix) not / (not-present tense)
    # "ng-" (prefix) (conjunction) and / then
    "nglui",       # threshold
    "nilgh'ri",    # anything / everything
    # "nnn-" (prefix) watch / protect
    "nog",         # come
    "nw",          # head / place
    # "-nyth" (suffix) servant of
    # "-og" (suffix) (emphatic)
    "ooboshu",     # visit
    # "-or" (suffix) force from / aspect of
    "orr'e",       # soul / spirit
    # "-oth" (suffix) native of
    # "ph'-" (prefix) over / beyond
    "phlegeth",    # realm of information
    "r'luh",       # secret / hidden
    "ron",         # religion / cult
    "s'uhn",       # pact
    "sgn'wahl",    # share space
    "shagg",       # realm of dreams
    "shogg",       # realm of darkness
    "shtunggli",   # notify / contact
    "shugg",       # realm of Earth
    "sll'ha",      # invite
    "stell'bsna",  # ask / pray for
    "syha'h",      # eternity
    "tharanak",    # promise / bring
    "throd",       # tremble
    "uaaah",       # (finish spell)
    "uh'e",        # people / crowd
    "uln",         # call / summon
    "vulgtlagln",  # pray to
    "vulgtm",      # prayer
    "wgah'n",      # reside in / control
    "y'hah",       # amen
    # "y-" (prefix) I / my
    "ya",          # I
    # "-yar" (suffix) time of / moment
    "zhro",        # (lift spell)
]


def plural(n, words):
    """Pick a word form from a space-separated 'singular plural' pair.

    Returns the first form when n == 1, otherwise the second.
    """
    words = words.split(' ')
    return words[0 if n == 1 else 1]


# split text to sentences
def split_sen(s, smart=True):
    """Split text into sentences on '. ', '? ', '! ' terminators.

    With smart=True, additionally place purely numeric "words"
    (digits, optionally wrapped in punctuation) on their own lines.
    """
    s = s.strip()
    lines = []
    endings = ('. ', '? ', '! ')
    pos = 0
    while pos < len(s):
        # Find the closest sentence terminator from the current position.
        min_index = None
        for end in endings:
            try:
                i = s.index(end, pos)
            except ValueError:
                continue
            if min_index is None or i < min_index:
                min_index = i
        # BUGFIX: compare against None — index 0 is a valid match
        # (the old truthiness test dropped a terminator at position 0).
        if min_index is not None:
            line = s[pos:min_index + 1]  # keep the punctuation mark
            pos = min_index + 2          # skip the trailing space
        else:
            line = s[pos:]
            pos = len(s)
        lines.append(line)

    if not smart:
        return lines

    # Smart mode: pull standalone numbers out onto separate lines.
    result_lines = []
    for line in lines:
        words = re.split(r'\s+', line)
        buf = []
        for w in words:
            if not re.sub(r'[\.\!\?\\/]', '', w).isdigit():
                buf.append(w)
            else:
                if len(buf):
                    result_lines.append(' '.join(buf))
                result_lines.append(w)
                buf = []
        if len(buf):
            result_lines.append(' '.join(buf))
    return result_lines


def analyze_sentences(lines, not_used=False):
    """Print a per-line report: word count, char counts, unique letters.

    With not_used=True, also list Russian letters absent from the text.
    """
    # Column width: longest line plus one space of padding.
    max_len = 0
    for line in lines:
        if len(line) > max_len:
            max_len = len(line)
    max_len += 1

    i = 1
    for line in lines:
        words = re.split(r'\s+', line)
        cprint('%2d. ' % i, 'cyan', end='')
        print(line, end='')
        if len(line) < max_len:
            print(' ' * (max_len - len(line)), end='')
        cprint(str(len(words)), 'green', attrs=['bold'], end='')
        cprint(' %s,' % plural(len(words), 'word words'), 'green', end='')
        cprint(' %d' % len(line), 'yellow', attrs=['bold'], end='')
        cprint('/', 'yellow', end='')
        cprint('%d' % len(line.replace(' ', '')), 'yellow', attrs=['bold'], end='')
        cprint(' chars ', 'yellow', end='')
        unique = unique_letters_amount(line)
        cprint('(', 'red', end='')
        cprint(unique, 'red', attrs=['bold'], end='')
        cprint(' unique)', 'red')
        i += 1

    cprint('Total unique characters: %d\n' % unique_letters_amount(''.join(lines)),
           'white', attrs=['bold'], end='')

    if not_used:
        not_used_list = []
        s = ''.join(lines).lower()
        for c in RU_ALPHABET:
            if c not in s and c not in not_used_list:
                not_used_list.append(c)
        cprint('Not used letters: %s' % ', '.join(not_used_list),
               'white', attrs=['bold'])


class LetterFreq:
    """A (letter, relative frequency) pair used during frequency analysis."""

    def __init__(self, letter, freq):
        self.letter = letter
        self.freq = freq

    def __repr__(self):
        # BUGFIX: the original formatted two values into an empty string,
        # which raises TypeError ("not all arguments converted").
        return 'LetterFreq(%r, %f)' % (self.letter, self.freq)


def analyze_letters_frequency_type4(s_in, eng=False, source_eng=False,
                                    only_unique=False, print_result=True):
    """Guess a substitution table by matching letter frequencies.

    Counts the frequency of every alphabet letter (and apostrophe) in
    s_in and pairs each one with the reference-alphabet letter whose
    frequency is closest.  Returns a {cipher_letter: guessed_letter}
    dict; with only_unique=True each guessed letter is used at most once.
    NOTE(review): with eng=True the guesses come back uppercase
    (ENG_FREQ keys are uppercase) — confirm callers expect that.
    """
    freqs = []
    added_letters = []
    repl = {}
    alphabet = RU_ALPHABET if not source_eng else ENG_ALPHABET
    alphabet_freq = RU_FREQ if not eng else ENG_FREQ

    # Keep only alphabet letters / apostrophes, registering each once.
    s = ''
    for c in s_in.lower():
        if c == "'" or c in alphabet:
            if c not in added_letters:
                added_letters.append(c)
                freqs.append(LetterFreq(c, 0))
            s += c

    # BUGFIX: guard against division by zero on input with no letters.
    if not s:
        return repl

    for lf in freqs:
        lf.freq = s.count(lf.letter) / len(s)

    # sort by frequency, most frequent first
    freqs = sorted(freqs, key=operator.attrgetter('freq'), reverse=True)

    already_found_letters = []
    for lf in freqs:
        closest_delta = 1
        closest_letter = '?'
        closest_alphabet_freq = 0
        for a_letter, a_freq in alphabet_freq.items():
            delta = abs(a_freq - lf.freq)
            if delta < closest_delta and (a_letter not in already_found_letters
                                          or not only_unique):
                closest_delta = delta
                closest_letter = a_letter
                closest_alphabet_freq = a_freq
        repl[lf.letter] = closest_letter
        already_found_letters.append(closest_letter)
        if print_result:
            print('%s (%f) ---> %s (%f)' % (lf.letter, lf.freq,
                                            closest_letter,
                                            closest_alphabet_freq))
    return repl


# s - lowercase string
# repl - { from: to } dict
def replace_by_dict(s, repl):
    """Return s with every character replaced via repl (missing -> as-is)."""
    return ''.join(repl.get(c, c) for c in s)


def print_hl(s, tohl, end="\n"):
    """Print s highlighting every occurrence of the character tohl.

    Comparison is case-insensitive.  Returns the list of 0-based
    indexes where the highlighted character occurred.
    """
    indexes = []
    for i, c in enumerate(s):
        if c.lower() != tohl.lower():
            print(c, end='')
        else:
            indexes.append(i)
            cprint(c, 'red', attrs=['bold', 'underline'], end='')
    print('', end=end)
    return indexes


def rot_en(s):
    """Print all 26 Caesar rotations of s (uppercased), one per line.

    Characters outside A-Z pass through unchanged.
    """
    alphabet = ''.join(chr(code) for code in range(65, 91))  # 'A'..'Z'
    s = s.upper()
    for shift in range(0, 26):
        for letter in s:
            if letter not in alphabet:
                print(letter, end='')
                continue
            new_letter = alphabet[(alphabet.index(letter) + shift) % 26]
            print(new_letter, end='')
        print()


def rot_ru(s, return_list=False):
    """Print (or return) all 33 Caesar rotations of s over the Russian alphabet.

    Input is uppercased; characters outside the alphabet pass through.
    With return_list=True the rotations are returned instead of printed.
    """
    alphabet = RU_ALPHABET.upper()
    result = []
    s = s.upper()
    for shift in range(0, 33):
        buf = ''
        for letter in s:
            if letter not in alphabet:
                buf += letter
                continue
            buf += alphabet[(alphabet.index(letter) + shift) % 33]
        if not return_list:
            print(buf)
        else:
            result.append(buf)
    if return_list:
        return result


def bf_all(table):
    """Return every combination picking one element per row of table."""
    return list(itertools.product(*table))


def spaceitout(string, amount):
    """Return string with int(amount) spaces inserted between characters."""
    # BUGFIX: the original computed int(amount) but then used the raw
    # value, failing for non-integer amounts.
    gap = ' ' * int(amount)
    return ''.join(char + gap for char in string).strip()


def unique_letters_amount(line):
    """Count how many distinct Russian letters occur in line (any case)."""
    line = line.upper()
    return sum(1 for a in RU_ALPHABET.upper() if a in line)


def bf_find_words(lines, words, nospaces=False):
    """Search for words spelled letter-by-letter down consecutive lines.

    A word "fits" at line_start when its i-th letter occurs anywhere in
    line line_start+i.  Every fit is printed with the letters
    highlighted and their 1-based positions listed.  With nospaces=True
    whitespace is stripped from the lines first.
    """
    min_word_len = 100
    for w in words:
        if len(w) < min_word_len:
            min_word_len = len(w)

    if nospaces:
        lines = list(map(lambda s: re.sub(r'\s+', '', s), lines))

    for line_start in range(0, len(lines) - min_word_len + 1):
        cur_lines = lines[line_start:]
        for w in words:
            w = w.lower()
            if len(w) > len(cur_lines):
                continue
            found = True
            max_sen_len = 0
            for i in range(0, len(w)):
                line = cur_lines[i].lower()
                if len(line) > max_sen_len:
                    max_sen_len = len(line)
                if w[i] not in line:
                    found = False
                    break
            if found:
                # word <w> found in <first>-<last>:
                print('word ', end='')
                cprint(w, 'white', attrs=['bold', 'underline'], end='')
                print(' found in ', end='')
                cprint(line_start + 1, 'white', attrs=['bold'], end='')
                print('-', end='')
                cprint(line_start + len(w), 'white', attrs=['bold'], end='')
                print(':')
                for i in range(0, len(w)):
                    line = cur_lines[i]
                    print(' ', end='')
                    cprint('%2d. ' % (line_start + i + 1), 'cyan', end='')
                    indexes = print_hl(line, w[i], end='')
                    if len(line) < max_sen_len:
                        print(' ' * (max_sen_len - len(line)), end='')
                    cprint(' %s. ' % (w[i]), 'cyan', end='')
                    # 1-based positions of the highlighted letter.
                    indexes = tuple(map(lambda n: str(n + 1), indexes))
                    for j, n in enumerate(indexes):  # renamed: was shadowing i
                        cprint(n, 'green', end='')
                        if j < len(indexes) - 1:
                            print(', ', end='')
                    print()
                print()


class BFGrepDictionary:
    """Brute-force generator of candidate words, one letter per line."""

    def __init__(self, lines, dict_file):
        def prepare_line(line):
            # Strip punctuation/whitespace, lowercase, dedupe letters.
            line = re.sub(r'[\.\!\?\s]', '', line)
            line = line.lower()
            line = list(set(line))
            return line
        self.lines = list(map(prepare_line, lines))
        self.lines_count = len(self.lines)
        self.dict_file = dict_file  # NOTE(review): currently unused

    def go(self):
        self.walk(0, '')
        # bf_all(self.lines)

    def walk(self, start_line, buf):
        """Recursively pick one letter per line, then emit the word."""
        line = self.lines[start_line]
        for i in range(len(line)):
            letter = line[i]
            # Heuristics: no word starts with 'ы', no doubled letters.
            if start_line == 0 and letter == 'ы':
                continue
            if start_line > 0 and letter == buf[-1:]:
                continue
            if start_line + 1 <= self.lines_count - 1:
                self.walk(start_line + 1, buf + letter)
            else:
                self.check_word(buf + letter)

    def check_word(self, s):
        print(s)


# def grep():
#     cmd = 'cat /tmp/all.txt | grep --color=never "%s" | xargs' % word
#     #print(cmd)
#     result = subprocess.check_output(cmd, shell=True, cwd=CWD).strip().decode('utf8')
#     if result:
#         result = result.replace("\n", ' ')
#         return result.split(' ')
#     else:
#         return None