| 1 |
|
|---|
| 2 |
import re |
|---|
| 3 |
|
|---|
| 4 |
|
|---|
| 5 |
from nltk import word_tokenize |
|---|
| 6 |
import yaml |
|---|
| 7 |
|
|---|
| 8 |
# Named positions inside a two-element (word, tag) pair: word first, tag
# second.  Presumably used by callers to index such pairs -- nothing in this
# file reads them.
i_word, i_tag = 0, 1
|---|
| 9 |
|
|---|
| 10 |
class Cleaner:
    '''
    Cleans up raw chat text.

    Replacement tables are read from YAML files under 'filters/' and applied
    with regular expressions.  While a table is being applied, every inserted
    replacement is wrapped in underscores ('_word_'): '_' is a word character,
    so the '\\b...\\b' patterns of later entries cannot match text that was
    already replaced.  The markers are stripped again once the pass is done.
    '''

    # Matches one '_..._' marker; group 1 is the text between the underscores.
    _marker_regexp = re.compile(r'\b_(.*?)_\b')

    # Letter-pattern heuristics used by contains_gibberish(), in the order
    # in which they are applied.
    _gibberish_regexps = (
        # four or more vowels (y included) in a row
        re.compile(r'(?i)[aeiouy][aeiouy]{3,}'),
        # a consonant followed by three or more consonants (the two sets are
        # deliberately kept exactly as they were: no 'r'/'t' in the first,
        # no 't' in the second)
        re.compile(r'(?i)[bcdfghjklmnpqsvwxz][bcdfghjklmnpqrsvwxz]{3,}'),
        # four or more vowel+consonant pairs in a row
        re.compile(r'(?i)([aeiouy][bcdfghjklmnpqrstvwxz]){4,}'),
        # 'q' followed by something other than 'u' and a word character, or
        # one of a few runs of neighbouring keyboard keys
        re.compile(r'(?i)q[^u]\w|sdf|fgh|hjk|bnm|cvb'),
    )

    # Word characters and whitespace: what contains_mostly_symbols() removes
    # to obtain the "symbols" part of a message.
    _word_or_space_regexp = re.compile(r'(?i)[\w\s]')

    # The explicit list of symbol characters (including NUL and backslash)
    # that contains_mostly_symbols() removes to obtain the "letters" part.
    _symbol_regexp = re.compile(r'''(?i)[`~!@#$%^&*()_+\-=\0\\\[\]{};:'",<.>/?]''')

    def __init__(self, log):
        '''
        log: logger object providing enter(), add() and leave(); the methods
             of this class report their progress through it
        '''
        self.log = log
        self.load()

        # A (word, tag)-shaped pair; presumably an end-of-sentence marker for
        # callers -- it is not used inside this class.
        self.end = ("end", "end")

        # Shortest run of text ending in '.', '?', '!' or ';'.  Single-letter
        # initials, 'Mr.'/'Mrs.' and short dotted abbreviations are tried
        # before the generic '.', so their periods are consumed as part of the
        # sentence instead of ending it.  Not used inside this class.
        self.sentences_regexp = re.compile(r"(?i)((\b\w\.|Mrs?\.|\w\.?\w\.|.)*?[\.\?!;])")

    def load(self):
        ''' Reads the latest *.yml files into memory '''
        self.html = self.read_file('filters/translate html to english.yml')
        self.internet = self.read_file('filters/translate from internet lingo.yml')
        self.spelling = self.read_file('filters/fix spelling mistakes.yml')
        self.swap = self.read_file('filters/modify persons.yml')

    def simplify(self, text):
        ''' Returns a lowercase version of 'text' without any spaces in it '''
        return text.replace(' ', '').lower()

    def read_file(self, file_name):
        '''
        Returns the parsed contents of the YAML file 'file_name'.

        The other methods iterate over the result as (from, to) pairs, so the
        files are presumably lists of two-element lists.
        '''
        self.log.enter(text="Loading %s" % file_name, locals=locals())

        # 'with' closes the handle even if reading or parsing raises
        with open(file_name, 'r') as myfile:
            # NOTE(review): yaml.load can build arbitrary Python objects and
            # newer PyYAML releases require an explicit Loader argument.  If
            # the filter files only hold plain lists/strings, yaml.safe_load
            # is the usual replacement -- confirm before switching.
            contents = yaml.load(myfile.read())

        self.log.leave()
        return contents

    @staticmethod
    def _marked(replacement):
        '''
        Returns a re.sub() callback that inserts '_replacement_' verbatim.

        A callback is used instead of a replacement string because re.sub()
        interprets backslashes in replacement strings (group references,
        escapes), which would corrupt or reject values containing '\\'.
        '''
        wrapped = '_%s_' % replacement
        return lambda match: wrapped

    @classmethod
    def _strip_markers(cls, text):
        ''' Returns 'text' with every '_word_' marker reduced to 'word' '''
        return cls._marker_regexp.sub(r'\1', text)

    def translate_from_html_to_english(self, text):
        '''
        Returns a version of 'text' where HTML codes have been replaced with human-readable text.

        Codes are matched case-insensitively anywhere in the text.
        NOTE(review): unlike the other passes, the '_..._' markers around the
        replacements are left in the returned text; they only disappear if a
        later pass (lingo, spelling, persons) strips them.
        '''
        self.log.enter(text="input text: %s" % text, locals=locals())

        for first, second in self.html:
            text = re.sub(r'(?i)%s' % re.escape(first), self._marked(second), text)
            self.log.add("\t%s: %s --> %s" % (text, first, second))

        self.log.leave("output text: %s" % text)
        return text

    def translate_from_internet_lingo(self, text):
        ''' Returns a version of 'text' where Internet lingo has been translated to English '''
        self.log.enter(text="input text: %s" % text, locals=locals())

        for internet, english in self.internet:
            # whole-word, case-insensitive match
            text = re.sub(r'(?i)\b%s\b' % re.escape(internet), self._marked(english), text)
            self.log.add("\t%s: %s --> %s" % (text, internet, english))

        self.log.leave()
        return self._strip_markers(text)

    def fix_spelling_mistakes(self, text):
        ''' Returns a version of 'text' where spelling mistakes have been corrected '''
        self.log.enter(text="input text: %s" % text, locals=locals())

        for key, value in self.spelling:
            # whole-word, case-insensitive match
            text = re.sub(r'(?i)\b%s\b' % re.escape(key), self._marked(value), text)
            self.log.add("\t%s: %s --> %s" % (text, key, value))

        self.log.leave()
        return self._strip_markers(text)

    def modify_persons(self, text, their_name):
        '''
        Returns a version of 'text' where the meanings of words like "you"
        change when viewed from the perspective of the other person

        text: the text to rewrite
        their_name: inserted in place of the whole words "I" and "yourself"
        '''
        self.log.enter(text="input text: %s" % text, locals=locals())

        text = re.sub(r'(?i)\b(I|yourself)\b', self._marked(their_name), text)

        for first, second in self.swap:
            # Swap in both directions; the markers stop the second
            # substitution from undoing the first.
            text = re.sub(r'(?i)\b%s\b' % re.escape(first), self._marked(second), text)
            text = re.sub(r'(?i)\b%s\b' % re.escape(second), self._marked(first), text)
            self.log.add("\t%s: %s <-> %s" % (text, first, second))

        # Markers are removed only after every pair has been applied, so a
        # word produced by one pair cannot be swapped again by a later one.
        text = self._strip_markers(text)

        self.log.leave("output text: %s" % text)
        return text

    def clean_sentence(self, sentence, their_name, toggle):
        '''
        Cleans 'sentence' and returns it as the list of tokens produced by
        nltk's word_tokenize
        sentence: the sentence to clean (it is lowercased first)
        their_name: the person who will eventually receive the sentence
        toggle: a dictionary whose keys enable/disable specific cleaning features;
                the keys "html", "internet", "spelling" and "swap" must all be present
        '''
        self.log.enter(text="input sentence: %s" % sentence, locals=locals())

        sentence = sentence.lower()

        if toggle["html"]:
            sentence = self.translate_from_html_to_english(sentence)
        if toggle["internet"]:
            sentence = self.translate_from_internet_lingo(sentence)
        if toggle["spelling"]:
            sentence = self.fix_spelling_mistakes(sentence)
        if toggle["swap"]:
            sentence = self.modify_persons(sentence, their_name)

        sentence = word_tokenize(sentence)

        self.log.leave("output sentence: %s" % sentence)
        return sentence

    @staticmethod
    def contains_gibberish(text):
        '''
        Returns True if gibberish is found in the text, False otherwise

        Three checks are made, in order: a whitespace-terminated token that
        occurs three times back to back, a run of four tokens that occurs
        three times, and the letter patterns in _gibberish_regexps.  The
        reason for a positive result is printed to standard output.
        '''
        # The repetition checks run on the text itself.  (Searching an
        # re.escape()d copy puts a backslash before each whitespace
        # character, which makes plain runs of spaces look like a tripled
        # token.)
        search1 = re.search(r'\s(\S+\s+?)\1\1', text)
        if search1 is not None:
            print("search1: %s" % (search1.groups(),))
            return True

        search2 = re.search(r'\s((\S+\s+?){4}).*\1.*\1', text)
        if search2 is not None:
            print("search2: %s" % (search2.groups(),))
            return True

        newtext = text
        for regexp in Cleaner._gibberish_regexps:
            newtext = regexp.sub('GIBBERISH', newtext)

        if text != newtext:
            print("BEFORE: %s" % text)
            print("AFTER: %s" % newtext)
            return True
        return False

    @staticmethod
    def contains_mostly_symbols(message):
        '''
        Returns True if 'message' holds more symbols than other characters

        "Symbols" is what remains after deleting word characters and
        whitespace; the other side is what remains after deleting the
        characters listed in _symbol_regexp (so whitespace counts there).
        '''
        as_text = "%s" % message
        symbols_only = Cleaner._word_or_space_regexp.sub('', as_text)
        letters_only = Cleaner._symbol_regexp.sub('', as_text)

        return len(letters_only) < len(symbols_only)
|---|
| 149 |
|
|---|