However, one of the things that is listed as being useful is the presence of the key search words in the body of the HTML page.
The following is a simple Python script that retrieves the page at a supplied URL, creates a searchable tree using LXML, extracts the text, and then counts the occurrences of non-numeric words of two or more characters (a short list of common noise words such as "the" and "of" is ignored).
# Functions to scrape the body text from a web page
# and return the top 10 occurring words
# Currently does not play well with HTML comments
# Also note that the order of the printed results is subject to change;
# this can be important if more than one element has the
# same occurrence count as the last element displayed
from lxml import html
import requests
import re
import collections
# Very common English words that are excluded from the word counts
noisewords = [
    'at', 'and', 'an', 'the', 'we',
    'to', 'is', 'of', 'by', 'not',
    'in', 'as', 'be', 'or', 'for',
]
def testgoodword(astring):
    """Return True if astring is worth counting, False if it is a noise word.

    Surrounding whitespace is stripped before the lookup in noisewords.
    The comparison is case sensitive.
    """
    return astring.strip() not in noisewords
def webscrape(aURL, timeout=30):
    """Count the words in the visible text of the web page at aURL.

    The page is downloaded, parsed into an lxml HTML tree, and the text of
    every element except script and style is collected. Words are then
    extracted, lower-cased and counted; purely numeric words and noise
    words (see testgoodword) are skipped.

    Parameters:
        aURL: address of the page to fetch.
        timeout: value handed to requests.get as its timeout, so that a
            server which never answers cannot block the call forever.

    Returns:
        A collections.Counter keyed by lower-case word.

    Raises:
        Whatever requests.get raises for network failures or timeouts.
    """
    # Get the page from the URL
    page = requests.get(aURL, timeout=timeout)
    # Make an HTML tree from the response body
    tree = html.fromstring(page.content)
    # Extract non script and non style text from the HTML tree.
    # The pieces are only counted afterwards, so their relative order
    # does not matter.
    pieces = []
    for elt in tree.iter():
        # Comments and processing instructions carry a non-string tag;
        # their own text is markup, not page content
        is_content = isinstance(elt.tag, str) and elt.tag not in ("script", "style")
        if is_content and elt.text is not None and elt.text.strip() != '':
            pieces.append(elt.text)
        # The tail is the text that follows this node inside its parent,
        # so it is page content whatever kind of node it trails
        if elt.tail is not None and elt.tail.strip() != '':
            pieces.append(elt.tail)
    bodytext = ' '.join(pieces)
    # Define a regular expression to extract words
    # (two or more word characters followed by a white space character)
    p = re.compile(r'\w\w+\s')
    # Use a Counter collection to record the occurrences (the word is the key)
    # Counter collections return zero if there is no element with a supplied key
    c = collections.Counter()
    # Iterate through the "words" found by the regular expression
    for match in p.finditer(bodytext):
        testword = match.group().strip().lower()
        if testword.isnumeric():  # Ignore numbers
            continue
        if testgoodword(testword):  # Only use non noise words
            c[testword] += 1
    return c
# Print the most common words
def print_webscrape(c):
    """Print a heading followed by the ten most common words in c.

    c is expected to offer most_common(), as collections.Counter does.
    Each entry is printed as the quoted word and its right-aligned count.
    """
    print('Most common:')
    top_entries = c.most_common(10)
    for entry in top_entries:
        # entry is a (word, count) pair and feeds both format fields
        print('\'%s\': %7d' % entry)
# Testing code
if __name__ == "__main__":
    # Runs only when the file is executed directly, not when it is imported
    test_url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
    c = webscrape(test_url)
    print_webscrape(c)
Most common:
'python': 189
'retrieved': 127
'programming': 49
'software': 35
'language': 29
'edit': 29
'with': 29
'pep': 26
'languages': 24
'march': 23
>>>
Most common:
'python': 189
'retrieved': 127
'programming': 49
'software': 35
'edit': 29
'language': 29
'with': 29
'pep': 26
'languages': 24
'march': 23
>>>
Most common:
'python': 189
'retrieved': 127
'programming': 49
'software': 35
'with': 29
'language': 29
'edit': 29
'pep': 26
'languages': 24
'march': 23
'february': 23
'org': 23
'from': 21
'van': 20
'december': 18
As you can see, there are three results with a count of 23. The Counter.most_common() function may return results with equal counts in a different order from one run to the next, as the runs above show.