Sunday, 15 September 2019

Traverse a website with Python

One thing that is important when building a website is that the links on the site connect up.

One way of checking that is to find all the links on the site and then follow each of them.

The following code will find all the links on the first page, and then will visit all the links that point to the same site.

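Do not point this at Wikipedia! The traversal follows every same-site link it finds and has no limit, so on a very large site it will not finish.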
# Functions to traverse a website
from lxml import html
import requests
def web_traverse(aURL, timeout=30):
    """Download a page and return the href of every anchor on it.

    Args:
        aURL: Address of the page to download.
        timeout: Seconds to wait on the server before requests gives up.
            Without a timeout requests.get can block indefinitely on an
            unresponsive host.

    Returns:
        A list with one entry per <a> element, in the order the XPath query
        returns them. Links are rewritten to absolute URLs using aURL as the
        base (a <base href> in the page is honoured). An anchor that has no
        href attribute contributes None, so callers must skip None entries.
    """
    # Get the page from the URL
    page = requests.get(aURL, timeout=timeout)
    # lxml refuses to parse an empty document, so treat a blank body as a
    # page that simply has no links.
    if not page.content.strip():
        return []
    # Make an HTML tree from the raw bytes
    tree = html.fromstring(page.content)
    tree.make_links_absolute(aURL, resolve_base_href=True)
    # Extract urls from the HTML tree
    return [alink.get("href") for alink in tree.xpath("//a")]

def recursive_traverse(aBaseURL, aURL, aLinks):
    """Collect every same-site link reachable from aURL, depth first.

    Each newly found link is reported, appended to aLinks and then fetched
    itself before the remaining links of the current page are examined.

    Args:
        aBaseURL: Prefix a link must start with to count as part of the site.
            A prefix test is used rather than a substring test so that
            off-site URLs which merely embed the site address (for example
            in a query string) are not followed.
        aURL: Page to start from.
        aLinks: List of links already known. It is extended in place.

    Returns:
        The same aLinks list, with every newly discovered link appended in
        discovery order.
    """
    print("Traversing " + aURL)
    first_page = web_traverse(aURL)

    # Set mirror of aLinks so the "already seen" test does not rescan the
    # whole list for every link on every page.
    seen = set(aLinks)

    # One iterator per page that is still being scanned; the last entry is
    # the page currently being read. An explicit stack is used instead of
    # function recursion so a long chain of pages cannot exhaust the
    # interpreter's recursion limit.
    pending = [iter(first_page if first_page is not None else [])]

    while pending:
        # Iterating an iterator resumes where the previous pass stopped, so
        # a parent page continues after the link that was descended into.
        for aLink in pending[-1]:
            if aLink is None:
                continue  # anchor without an href
            if not aLink.startswith(aBaseURL):
                continue  # points outside the site
            if aLink in seen:
                continue
            print("found link:" + aLink)
            seen.add(aLink)
            aLinks.append(aLink)
            print("Traversing " + aLink)
            child_page = web_traverse(aLink)
            pending.append(iter(child_page if child_page is not None else []))
            break  # descend into the new page before finishing this one
        else:
            # Current page exhausted; go back to the page that linked to it.
            pending.pop()
    return aLinks
    

    
if __name__ == "__main__":
    # Crawl only when this file is run directly, not when it is imported.
    baseURL = "https://technologyisnotdull.blogspot.com"
    # Start at the site root with no links known yet.
    c = recursive_traverse(baseURL, baseURL, [])
    print("Links found:")
    for thing in c:
        print(thing)