# A simple Web crawler
import urllib
import urlparse
# Builds the initial "found" dictionary for a crawl. URLs should be added
# as keys into this dictionary as they are discovered; at the outset it
# holds only the starting URL.
# (Placeholder definition: the assignment asks you to replace it.)
def create_found(start_url):
    found = {}
    found[start_url] = True
    return found
# Chooses the next URL to visit out of "queue", the list of unvisited URLs.
# The "found" dictionary is passed in so it can inform the choice; this
# placeholder ignores it and simply takes the first entry of the queue.
# (Placeholder definition: the assignment asks you to replace it.)
def select_from_queue(queue, found):
    first_url = queue[0]
    return first_url
# Updates the dictionary "found" to reflect that a page which has been
# loaded contains a link to "new_url". This placeholder just records the
# URL as a key mapped to True; nothing is returned.
# (Placeholder definition: the assignment asks you to replace it.)
def add_to_found(found, new_url):
    found.update({new_url: True})
#
# You should not modify any of the below code for the assignment
#
# Base URL of the directory to crawl. crawl() begins at
# START_URL + 'index.html' and only follows discovered URLs that start
# with this prefix, i.e. URLs that lie within the same directory.
START_URL = 'http://ozark.hendrix.edu/~burch/cs/150/assn/12/friends/'
# Crawls pages starting from START_URL + 'index.html', printing one line per
# page fetched ("loaded ..." or "could not load ...").
#
# max_pages is the maximum number of pages to fetch before stopping
# (default 15, the limit this crawler has always used).
#
# Nothing is returned. Any exception raised while opening or reading a page
# propagates to the caller, but the connection is closed first.
def crawl(max_pages=15):
    # The crawler uses two important structures for its data:
    #   queue is the list of URLs that should be visited in the future
    #   found is a map whose keys represent URLs that have been discovered
    start_url = START_URL + 'index.html'
    queue = [start_url]
    found = create_found(start_url)
    visited = 0  # pages fetched so far; capped so the crawler doesn't go crazy
    while visited < max_pages and queue:
        # determine the next URL to fetch
        visited += 1
        url = select_from_queue(queue, found)
        queue.remove(url)

        # now fetch it; try/finally guarantees the connection is released
        # even when getcode() or read() raises
        conn = urllib.urlopen(url)
        try:
            code = conn.getcode()
            text = conn.read() if code == 200 else ''
        finally:
            conn.close()

        if not text:
            # Also reached for a 200 response with an empty body.
            # Single-argument print(...) parses under Python 2 and 3 and
            # emits exactly the same line as the old print statement.
            print('could not load %s due to error code %s' % (url, code))
            continue

        print('loaded %s' % url)
        # go through HTML looking for links to other pages
        for new_url in find_urls(text, url):
            if not new_url.startswith(START_URL):
                continue  # outside the crawled directory
            if new_url not in found:
                queue.append(new_url)
            # NOTE(review): the original indentation was lost; add_to_found
            # is called here for every in-directory link (not only new ones),
            # matching its description "a page ... contains a link to
            # new_url" -- confirm against the original file.
            add_to_found(found, new_url)
# Returns the list of distinct URLs referenced by href="..." attributes in
# "text", in order of first appearance. Each href value is converted to a
# URL string by build_url(base_url, href).
#
# Only the literal, double-quoted form href="..." is recognised. If the
# final href=" has no closing quote, scanning stops there instead of
# treating the remainder of the text as a link.
def find_urls(text, base_url):
    marker = 'href="'
    urls = []
    seen = set()  # same contents as urls, for constant-time duplicate checks
    start = text.find(marker)
    while start >= 0:
        value_start = start + len(marker)
        quote = text.find('"', value_start)
        if quote < 0:
            # Unterminated attribute: str.find returned -1, which as a slice
            # end would silently mean "everything but the last character".
            # With no '"' left, no further href=" can follow either.
            break
        url = build_url(base_url, text[value_start:quote])
        if url not in seen:
            seen.add(url)
            urls.append(url)
        # Resume after the closing quote so that it can never be mistaken
        # for the opening quote of another attribute.
        start = text.find(marker, quote + 1)
    return urls
# Converts "href" into an absolute URL string. An href that carries no
# scheme of its own is treated as relative and joined onto "base_url".
# Only the scheme, network location and path are kept in the result; any
# parameters, query string or fragment are dropped.
def build_url(base_url, href):
    parts = urlparse.urlparse(href)
    if parts.scheme == '':
        absolute = urlparse.urljoin(base_url, href)
        parts = urlparse.urlparse(absolute)
    return ''.join((parts.scheme, '://', parts.netloc, parts.path))
# Start the crawl when this file is run. This is a top-level call, so it
# also executes if the file is imported as a module.
crawl()