# webcrawler.py: Performs a breadth-first or depth-first search of (a part # of) the World Wide Web, starting at some given web page, to solve some # problem of interest. # Author: Joshua R. Davis # Author: !! YOUR NAME HERE. # Date: !! # Given a string of HTML representing a web page, returns a list of strings, # namely all of the URLs that appear as links on the page. Warning: Some of # the URLs are relative, hence incomplete. To complete them, you need to # know the base href of the web page; I don't have code to do this. def links(string): urls = [] # This is done using a finite-state automaton with 6 states, 0 to 5. state = 0 i = 0 while i < len(string): if state == 0: # We are looking for '<'. if string[i] == '<': state = 1 elif state == 1: # We are looking for 'a '. if string[i] == 'a' or string[i] == 'A': if string[i + 1].isspace(): state = 2 i += 1 else: state = 0 elif not string[i].isspace(): state = 0 elif state == 2: # We are looking for 'href'. if string[i] == 'h' or string[i] == 'H': # !! This is buggy if the file terminates unexpectedly. if string[i + 1:i + 4] == 'ref' or string[i + 1:i + 4] == 'REF': state = 3 i += 3 else: state = 0 elif not string[i].isspace(): state = 0 elif state == 3: # We are looking for '='. if string[i] == '=': state = 4 elif not string[i].isspace(): state = 0 elif state == 4: # We are looking for '"'. if string[i] == '"': state = 5 elif not string[i].isspace(): state = 0 elif state == 5: # We read off the URL, which terminates at '"'. url = '' while i < len(string) and string[i] != '"': url = url + string[i] i += 1 urls.append(url) state = 0 i += 1 return urls from urllib import urlopen from graph import * # If the user ran this file explicitly, then run this demo code. if __name__ == "__main__": # Fetch web page (i.e. a string of HTML) for the current URL. file = urlopen('http://www.nytimes.com/') html = file.read() file.close() # Parse the web page to capture all URLs in contexts. urls = links(html) # Keep only URLs that begin with 'http:' and end with '.html'. urls = [url for url in urls if len(url) >= 5 and url[:5] == 'http:' and url[-5:] == '.html'] print urls