root/imbot/Cleaner.py

Revision 116, 5.3 kB (checked in by njohri2, 3 years ago)

Finaler version

Line 
1 # modules that come with Python
2 import re
3
4 # third party dependencies
5 from nltk import word_tokenize
6 import yaml
7
# NOTE(review): presumably index positions into (word, tag) tuples -- confirm against callers
i_word, i_tag = 0, 1
9
class Cleaner:
    '''
    Normalizes chat text: rewrites HTML codes, Internet lingo, spelling
    mistakes and person-dependent words using the pair lists read from the
    filters/*.yml files, and offers static heuristics that flag gibberish
    or symbol-heavy messages.
    '''

    def __init__(self, log):
        '''
        log: object providing enter(), add() and leave(); every method of
             this class reports its progress through it
        '''
        self.log = log
        self.load()
        # NOTE(review): not read anywhere in this class; presumably an
        # end-of-sentence (word, tag) marker used by callers -- confirm
        self.end = ("end", "end")

        # formerly re.compile(r"(?i)(((\b\w\.|Mrs?\.|B\.C\.|A\.D\.)|St\.|.)*?[\.\?!;])")
        self.sentences_regexp = re.compile(r"(?i)((\b\w\.|Mrs?\.|\w\.?\w\.|.)*?[\.\?!;])")

    def load(self):
        ''' Reads the latest *.yml files into memory '''
        self.html = self.read_file('filters/translate html to english.yml')
        self.internet = self.read_file('filters/translate from internet lingo.yml')
        self.spelling = self.read_file('filters/fix spelling mistakes.yml')
        self.swap = self.read_file('filters/modify persons.yml')

    def simplify(self, text):
        ''' Returns a lowercase version of 'text' without any spaces in it '''
        return text.replace(' ', '').lower()

    def read_file(self, file_name):
        ''' Returns the parsed contents of the YAML file 'file_name' '''
        self.log.enter(text="Loading %s" % file_name, locals=locals())

        # 'with' closes the handle even when reading or parsing raises
        with open(file_name, 'r') as myfile:
            # NOTE(review): yaml.load can construct arbitrary Python objects.
            # The filter files are assumed to be trusted; consider
            # yaml.safe_load if they can ever come from outside the project.
            contents = yaml.load(myfile.read())

        self.log.leave()
        return contents

    @staticmethod
    def _mark(pattern, replacement, text):
        '''
        Returns 'text' with every match of 'pattern' replaced by
        '_replacement_'. The underscores keep an inserted word from being
        matched again by a later \\b...\\b pattern. A callable is handed to
        re.sub so that 'replacement' is inserted literally: backslashes in
        it are not interpreted as group references or escapes.
        '''
        marked = '_%s_' % replacement
        return re.sub(pattern, lambda match: marked, text)

    @staticmethod
    def _unmark(text):
        ''' Returns 'text' with the underscores added by _mark stripped '''
        return re.sub(r'\b_(.*?)_\b', r'\1', text)

    def _substitute_pairs(self, pairs, pattern_template, text):
        '''
        Applies every (search, replacement) pair in 'pairs' to 'text' and
        logs the intermediate result after each pair.
            pattern_template:  regular expression with one %s placeholder
                               that receives the escaped search string
        '''
        for search, replacement in pairs:
            text = self._mark(pattern_template % re.escape(search), replacement, text)
            self.log.add("\t%s: %s --> %s" % (text, search, replacement))
        return text

    def translate_from_html_to_english(self, text):
        '''
        Returns a version of 'text' where HTML codes have been replaced with
        human-readable text. Replacements stay wrapped in underscores; this
        method does not strip them.
        '''
        self.log.enter(text="input text: %s" % text, locals=locals())

        # no \b anchors here: the codes are matched anywhere in the text
        text = self._substitute_pairs(self.html, r'(?i)%s', text)

        self.log.leave("output text: %s" % text)
        return text

    def translate_from_internet_lingo(self, text):
        ''' Returns a version of 'text' where Internet lingo has been translated to English '''
        self.log.enter(text="input text: %s" % text, locals=locals())

        text = self._substitute_pairs(self.internet, r'(?i)\b%s\b', text)

        self.log.leave()
        return self._unmark(text)

    def fix_spelling_mistakes(self, text):
        ''' Returns a version of 'text' where spelling mistakes have been corrected '''
        self.log.enter(text="input text: %s" % text, locals=locals())

        text = self._substitute_pairs(self.spelling, r'(?i)\b%s\b', text)

        self.log.leave()
        return self._unmark(text)

    def modify_persons(self, text, their_name):
        '''
        Returns a version of 'text' where the meanings of words like "you"
        change when viewed from the perspective of the other person
            their_name:  screenname substituted for "I" and "yourself"
        '''
        self.log.enter(text="input text: %s" % text, locals=locals())

        # replace all occurrences of "I" with the screenname of the person who's talking
        text = self._mark(r'(?i)\b(I|yourself)\b', their_name, text)

        for first, second in self.swap:
            # swap in both directions; the underscore markers stop the
            # second substitution from undoing the first one
            text = self._mark(r'(?i)\b%s\b' % re.escape(first), second, text)
            text = self._mark(r'(?i)\b%s\b' % re.escape(second), first, text)
            self.log.add("\t%s: %s <-> %s" % (text, first, second))
        text = self._unmark(text)

        self.log.leave("output text: %s" % text)
        return text

    def clean_sentence(self, sentence, their_name, toggle):
        '''
        Lowercases and cleans 'sentence' and returns the result of nltk's
        word_tokenize on it
            sentence:      the sentence to clean
            their_name:    screenname handed to modify_persons
            toggle:        a dictionary whose keys ("html", "internet",
                           "spelling", "swap") enable/disable specific
                           cleaning features; all four keys must be present
        '''
        self.log.enter(text="input sentence: %s" % sentence, locals=locals())

        sentence = sentence.lower()

        # clean up the sentence a little
        if toggle["html"]:
            sentence = self.translate_from_html_to_english(sentence)
        if toggle["internet"]:
            sentence = self.translate_from_internet_lingo(sentence)
        if toggle["spelling"]:
            sentence = self.fix_spelling_mistakes(sentence)
        if toggle["swap"]:
            sentence = self.modify_persons(sentence, their_name)

        # tokenize the sentence
        sentence = word_tokenize(sentence)

        self.log.leave("output sentence: %s" % sentence)
        return sentence

    @staticmethod
    def contains_gibberish(text):
        '''
        Returns True if gibberish is found in the text, False otherwise.
        Prints the evidence to stdout whenever it returns True.
        '''
        # the same token three times in a row (after a whitespace character).
        # The search runs on the raw text: escaping the subject string would
        # insert backslashes that change what counts as a token.
        search1 = re.search(r'\s(\S+\s+?)\1\1', text)
        if search1 is not None:
            print("search1: %s" % (search1.groups(),))
            return True

        # the same run of four tokens occurring three times
        search2 = re.search(r'\s((\S+\s+?){4}).*\1.*\1', text)
        if search2 is not None:
            print("search2: %s" % (search2.groups(),))
            return True

        unlikely_letter_runs = (
            r'(?i)[aeiouy][aeiouy]{3,}',
            r'(?i)[bcdfghjklmnpqsvwxz][bcdfghjklmnpqrsvwxz]{3,}',
            r'(?i)([aeiouy][bcdfghjklmnpqrstvwxz]){4,}',
            r'(?i)q[^u]\w|sdf|fgh|hjk|bnm|cvb',
        )
        newtext = text
        for pattern in unlikely_letter_runs:
            newtext = re.sub(pattern, 'GIBBERISH', newtext)

        if text != newtext:
            print("BEFORE: %s" % text)
            print("AFTER: %s" % newtext)
            return True
        return False

    @staticmethod
    def contains_mostly_symbols(message):
        '''
        Returns True if 'message' has more symbol characters than
        non-symbol characters, False otherwise
        '''
        message = "%s" % message
        # what is left after removing word characters and whitespace
        symbols_only = re.sub(r'(?i)[\w\s]', '', message)
        # what is left after removing ASCII punctuation ('|' included, so a
        # message made of pipes is not counted as letters)
        letters_only = re.sub(r'(?i)[`~!@#$%^&*()_+\-=\0\\|\[\]{};:\'",<.>/?]', '', message)

        return len(letters_only) < len(symbols_only)
Note: See TracBrowser for help on using the browser.