# webcrawler.py: Performs a breadth-first or depth-first search of (a part 
# of) the World Wide Web, starting at some given web page, to solve some 
# problem of interest.
# Author: Joshua R. Davis
# Author: !! YOUR NAME HERE.
# Date: !!

# Given a string of HTML representing a web page, returns a list of strings, 
# namely all of the URLs that appear as links on the page. Warning: Some of 
# the URLs are relative, hence incomplete. To complete them, you need to 
# know the base href of the web page; I don't have code to do this.
def links(string):
    urls = []
    # This is done using a finite-state automaton with 6 states, 0 to 5.
    state = 0
    i = 0
    while i < len(string):
        if state == 0:
            # We are looking for '<'.
            if string[i] == '<':
                state = 1
        elif state == 1:
            # We are looking for 'a '.
            if string[i] == 'a' or string[i] == 'A':
                if string[i + 1].isspace():
                    state = 2
                    i += 1
                else:
                    state = 0
            elif not string[i].isspace():
                state = 0
        elif state == 2:
            # We are looking for 'href'.
            if string[i] == 'h' or string[i] == 'H':
                # !! This is buggy if the file terminates unexpectedly.
                if string[i + 1:i + 4] == 'ref' or string[i + 1:i + 4] == 'REF':
                    state = 3
                    i += 3
                else:
                    state = 0
            elif not string[i].isspace():
                state = 0
        elif state == 3:
            # We are looking for '='.
            if string[i] == '=':
                state = 4
            elif not string[i].isspace():
                state = 0
        elif state == 4:
            # We are looking for '"'.
            if string[i] == '"':
                state = 5
            elif not string[i].isspace():
                state = 0
        elif state == 5:
            # We read off the URL, which terminates at '"'.
            url = ''
            while i < len(string) and string[i] != '"':
                url = url + string[i]
                i += 1
            urls.append(url)
            state = 0
        i += 1
    return urls

from urllib import urlopen
from graph import *

# If the user ran this file explicitly, then run this demo code.
if __name__ == "__main__":
    # Fetch web page (i.e. a string of HTML) for the current URL.
    file = urlopen('http://www.nytimes.com/')
    html = file.read()
    file.close()
    # Parse the web page to capture all URLs in <a href=""> contexts.
    urls = links(html)
    # Keep only URLs that begin with 'http:' and end with '.html'.
    urls = [url for url in urls if len(url) >= 5 and url[:5] == 'http:' and url[-5:] == '.html']
    print urls
