root/imbot/Scraper.py

Revision 97, 3.5 kB (checked in by njohri2, 3 years ago)

Updated Graph to handle single edges for distance graphs with hashes of distances.

Line 
1 #!/usr/bin/env python
2 import os, re
3 from mechanize import Browser
4 from BeautifulSoup import BeautifulSoup
5
class Scraper:
    '''
    Scrapes a local copy of the CIA World Factbook and feeds the facts to a brain.

    The brain is supplied by the caller. This class only touches its
    ``cleaner``, ``their_history`` / ``history``, ``parse_incoming_message``
    and ``save`` members.
    '''

    def __init__(self, brain):
        ''' Stores the brain to teach and creates the mechanize browser used for fetching '''
        self.brain = brain
        self.mech = Browser()

    def scrape_website(self, url):
        ''' Returns a BeautifulSoup parse tree of the url provided '''
        page = self.mech.open(url)
        html = page.read()
        return BeautifulSoup(html)

    def learn(self):
        ''' Scrapes all available sources of data and adds them to the brain, then saves it '''
        print("Scraping data...")
        self.scrape_country_capitals()
        # The country histories source is currently disabled:
        #self.scrape_country_histories()
        print("Done scraping!")
        print("Saving to disk...")
        self.brain.save()
        print("...Done!")

    @staticmethod
    def _clean_country(name):
        '''
        Normalises a Factbook country label.

        Newlines are dropped, "Korea, South" becomes "South Korea",
        parenthesised notes are removed and the result is stripped.
        '''
        name = re.sub(r'\n', '', name)
        name = re.sub(r'(\w+), (.+)', r'\2 \1', name)
        name = re.sub(r'\([^)]+\)', '', name)
        return name.strip()

    @staticmethod
    def _extract_capital(cell_html):
        '''
        Returns the capital named in the HTML of a table cell, or None.

        None is returned when the cell has no "name:" entry or when the
        captured name is empty once notes and whitespace are removed.
        '''
        match = re.search(r"<i>name:</i> ([\w '-]+)", cell_html)
        if match is None:
            return None
        # The capture class cannot contain '(' , ')' or ';', so with the
        # current pattern this substitution is a no-op; it is kept as a
        # guard in case the pattern is ever widened.
        capital = re.sub(r'(\(.*\)|;.*)', '', match.group(1))
        # The capture class contains the space, so a name followed by
        # " (note)" arrives with trailing whitespace that must be removed.
        capital = capital.strip()
        return capital or None

    def _factbook_rows(self, field_page):
        '''
        Yields (country, data_cell) for every usable row of a Factbook field page.

        field_page is the file name under Scrape%20Data/factbook/fields/ in
        the current working directory. The header row is skipped, and so
        are rows without two cells or without a single-string country link.
        '''
        url = ("file:///" + re.sub(r'\\', '/', os.getcwd())
               + "/Scrape%20Data/factbook/fields/" + field_page)
        soup = self.scrape_website(url)
        table = soup.find("table", border=1)

        # for every non-header table row
        for row in table.findAll('tr')[1:]:
            cells = row.findAll('td')
            if len(cells) < 2:
                continue
            link = cells[0].a
            if link is None or link.string is None:
                continue
            yield self._clean_country(link.string), cells[1]

    def scrape_country_capitals(self):
        '''
        Scrapes the country capitals out of the CIA World Factbook and
        teaches each one to the brain phrased a few different ways
        '''
        print("Scraping country capitals out of the CIA World Factbook")
        source = "The CIA"
        history_key = self.brain.cleaner.simplify(source)
        self.brain.their_history[history_key] = []

        for country, cell in self._factbook_rows("2057.html"):
            capital = self._extract_capital(str(cell))
            if capital is None:
                continue

            # phrase the fact a few different ways
            sentences = (
                "%s is the capital of %s" % (capital, country),
                "%s is the capital city of %s" % (capital, country),
                "the capital of %s is %s" % (country, capital),
                "the capital city of %s is %s" % (country, capital),
            )

            # one progress line per country, with one dot per sentence
            print("%s = %s%s" % (country, capital, " ." * len(sentences)))

            # add each sentence to the brain, clearing the source's history
            # after every sentence
            for sentence in sentences:
                self.brain.parse_incoming_message(source, sentence)
                self.brain.their_history[history_key] = []

    def scrape_country_histories(self):
        '''
        Scrapes the CIA World Factbook for short historical blurbs
        about all the countries in the world and adds them to our brain
        '''
        print("Scraping country histories out of the CIA World Factbook")
        source = "The CIA"
        history_key = self.brain.cleaner.simplify(source)
        # NOTE(review): this method resets brain.history, while
        # scrape_country_capitals resets brain.their_history. Confirm which
        # attribute the brain exposes before re-enabling this method.
        self.brain.history[history_key] = []

        for country, cell in self._factbook_rows("2028.html"):
            # cells without a single text string carry nothing to learn
            if cell.string is None:
                continue
            background = str(cell.string)

            # split the useful information into sentences
            sentences = [result[0].strip() for result in self.brain.cleaner.sentences_regexp.findall(background)]

            # one progress line per country, with one dot per sentence
            print("%s%s" % (country, " ." * len(sentences)))

            # add each sentence to the brain
            for sentence in sentences:
                self.brain.parse_incoming_message(source, sentence)

            # reset the history to prevent the next country's information from being associated with this one's
            self.brain.history[history_key] = []
102
Note: See TracBrowser for help on using the browser.