| 1 |
|
|---|
| 2 |
import os, re |
|---|
| 3 |
from mechanize import Browser |
|---|
| 4 |
from BeautifulSoup import BeautifulSoup |
|---|
| 5 |
|
|---|
| 6 |
class Scraper: |
|---|
| 7 |
def __init__(self, brain): |
|---|
| 8 |
self.brain = brain |
|---|
| 9 |
self.mech = Browser() |
|---|
| 10 |
|
|---|
| 11 |
def scrape_website(self, url): |
|---|
| 12 |
''' Returns a BeautifulSoup parse tree of the url provided ''' |
|---|
| 13 |
page = self.mech.open(url) |
|---|
| 14 |
html = page.read() |
|---|
| 15 |
return BeautifulSoup(html) |
|---|
| 16 |
|
|---|
| 17 |
def learn(self): |
|---|
| 18 |
''' Scrapes all available sources of data and adds them to the brain ''' |
|---|
| 19 |
print "Scraping data..." |
|---|
| 20 |
self.scrape_country_capitals() |
|---|
| 21 |
|
|---|
| 22 |
print "Done scraping!" |
|---|
| 23 |
print "Saving to disk..." |
|---|
| 24 |
self.brain.save() |
|---|
| 25 |
print "...Done!" |
|---|
| 26 |
|
|---|
| 27 |
def scrape_country_capitals(self): |
|---|
| 28 |
print "Scraping country capitals out of the CIA World Factbook" |
|---|
| 29 |
source = "The CIA" |
|---|
| 30 |
self.brain.their_history[self.brain.cleaner.simplify(source)] = [] |
|---|
| 31 |
url = "file:///" + re.sub(r'\\', '/', os.getcwd()) + "/Scrape%20Data/factbook/fields/2057.html" |
|---|
| 32 |
soup = self.scrape_website(url) |
|---|
| 33 |
table = soup.find("table", border=1) |
|---|
| 34 |
|
|---|
| 35 |
|
|---|
| 36 |
for row in table.findAll('tr')[1:]: |
|---|
| 37 |
|
|---|
| 38 |
col = row.findAll('td') |
|---|
| 39 |
country = col[0].a.string |
|---|
| 40 |
country = re.sub(r'(?m)\n', '', country) |
|---|
| 41 |
capital = re.search(r"<i>name:</i> ([\w '-]+)", str(col[1])) |
|---|
| 42 |
if capital == None: continue |
|---|
| 43 |
capital = capital.groups()[0] |
|---|
| 44 |
|
|---|
| 45 |
|
|---|
| 46 |
country = re.sub(r'\([^)]+\)', '', re.sub(r'(\w+), (.+)', r'\2 \1', country)).strip() |
|---|
| 47 |
|
|---|
| 48 |
|
|---|
| 49 |
capital = re.sub(r'(\(.*\)|;.*)', '', capital) |
|---|
| 50 |
|
|---|
| 51 |
|
|---|
| 52 |
sentences = ( |
|---|
| 53 |
"%s is the capital of %s" % (capital, country), |
|---|
| 54 |
"%s is the capital city of %s" % (capital, country), |
|---|
| 55 |
"the capital of %s is %s" % (country, capital), |
|---|
| 56 |
"the capital city of %s is %s" % (country, capital) |
|---|
| 57 |
) |
|---|
| 58 |
|
|---|
| 59 |
|
|---|
| 60 |
print country, '=', capital, |
|---|
| 61 |
for sentence in sentences: |
|---|
| 62 |
print '.', |
|---|
| 63 |
self.brain.parse_incoming_message(source, sentence) |
|---|
| 64 |
self.brain.their_history[self.brain.cleaner.simplify(source)] = [] |
|---|
| 65 |
print '' |
|---|
| 66 |
|
|---|
| 67 |
def scrape_country_histories(self): |
|---|
| 68 |
''' |
|---|
| 69 |
Scrapes the CIA World Factbook for short historical blurbs |
|---|
| 70 |
about all the countries in the world and adds them to our brain |
|---|
| 71 |
''' |
|---|
| 72 |
print "Scraping country histories out of the CIA World Factbook" |
|---|
| 73 |
source = "The CIA" |
|---|
| 74 |
self.brain.history[self.brain.cleaner.simplify(source)] = [] |
|---|
| 75 |
url = "file:///" + re.sub(r'\\', '/', os.getcwd()) + "/Scrape%20Data/factbook/fields/2028.html" |
|---|
| 76 |
soup = self.scrape_website(url) |
|---|
| 77 |
table = soup.find("table", border=1) |
|---|
| 78 |
|
|---|
| 79 |
|
|---|
| 80 |
for row in table.findAll('tr')[1:]: |
|---|
| 81 |
|
|---|
| 82 |
col = row.findAll('td') |
|---|
| 83 |
country = col[0].a.string |
|---|
| 84 |
background = str(col[1].string) |
|---|
| 85 |
if col[1].string == None: continue |
|---|
| 86 |
|
|---|
| 87 |
|
|---|
| 88 |
country = re.sub(r'\([^)]+\)', '', re.sub(r'(\w+), (.+)', r'\2 \1', country)).strip() |
|---|
| 89 |
|
|---|
| 90 |
|
|---|
| 91 |
sentences = [result[0].strip() for result in self.brain.cleaner.sentences_regexp.findall(background)] |
|---|
| 92 |
|
|---|
| 93 |
|
|---|
| 94 |
print country, |
|---|
| 95 |
for sentence in sentences: |
|---|
| 96 |
print '.', |
|---|
| 97 |
self.brain.parse_incoming_message(source, sentence) |
|---|
| 98 |
print '' |
|---|
| 99 |
|
|---|
| 100 |
|
|---|
| 101 |
self.brain.history[self.brain.cleaner.simplify(source)] = [] |
|---|
| 102 |
|
|---|