Technology Is Not Dull: Traverse a website with Python

Sunday, 15 September 2019

Traverse a website with Python

One thing that is important when building a website is that the links on the site connect up.

One way of checking that is to find all the links on the site and then follow each one to make sure it loads.

The following code will find all the links on the first page, and then will visit all the links that point to the same site.

Do not point this at Wikipedia! The traversal has no depth or page limit, so it will try to follow every same-site link it finds.

# Functions to traverse a website

from lxml import html

import requests

def web_traverse(aURL, timeout=10):
    """Return the href of every anchor on the page at aURL.

    The page is downloaded with requests and parsed with lxml. Links are made
    absolute against aURL (honouring a <base href> if the page has one)
    before they are extracted.

    Args:
        aURL: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout requests waits indefinitely, so a single unresponsive
            host would stall the whole traversal.

    Returns:
        A list with one entry per <a> element found on the page. An entry is
        None when the anchor has no href attribute.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout expires.
    """
    # Get the page from the URL
    page = requests.get(aURL, timeout=timeout)

    # Make an HTML tree from the text
    tree = html.fromstring(page.content)
    tree.make_links_absolute(aURL, resolve_base_href=True)

    # Extract urls from the HTML tree
    return [anchor.get("href") for anchor in tree.xpath("//a")]

def recursive_traverse(aBaseURL, aURL, aLinks):
    """Collect every same-site link reachable from aURL, depth first.

    Starting at aURL, each page is fetched with web_traverse. Every link that
    belongs to the site and has not been recorded yet is appended to aLinks
    and then traversed in turn before the remaining links of the current
    page are examined.

    Args:
        aBaseURL: Prefix identifying the site. A link counts as same-site
            only when it starts with this string; a plain substring test
            would also accept off-site URLs that merely embed the site
            address (for example in a query string).
        aURL: Page at which to start traversing.
        aLinks: List of links already found. It is extended in place.

    Returns:
        aLinks, with the newly discovered links appended in discovery order.
    """
    # Set companion to aLinks so the "already found?" test is O(1) instead of
    # a scan of the whole list for every link on every page.
    seen = set(aLinks)

    print("Traversing " + aURL)

    # One iterator per page whose links are still being examined. An explicit
    # stack replaces call recursion: the original recursed one level per newly
    # found link, which can exceed Python's recursion limit on larger sites.
    # "or []" covers a web_traverse result of None as well as an empty list.
    pending = [iter(web_traverse(aURL) or [])]

    while pending:
        for aLink in pending[-1]:
            if aLink is None:
                continue  # anchor without an href
            if not aLink.startswith(aBaseURL):
                continue  # points to another site
            if aLink in seen:
                continue  # already recorded

            print("found link:" + aLink)
            aLinks.append(aLink)
            seen.add(aLink)

            # Descend into the new page now; the iterator left on the stack
            # remembers where to resume on the current page afterwards.
            print("Traversing " + aLink)
            pending.append(iter(web_traverse(aLink) or []))
            break
        else:
            # Every link on the page at the top of the stack has been handled.
            pending.pop()

    return aLinks





if __name__ == "__main__":
    # Runs only when the file is executed as a script, not when imported.
    baseURL = "https://technologyisnotdull.blogspot.com"

    # Start at the site root with an empty list of known links.
    found_links = recursive_traverse(baseURL, baseURL, [])

    # Heading first, then one link per line.
    print("Links found:", *found_links, sep="\n")