1️⃣ 任务要求
2️⃣ 命令功能
- $build$
- 此命令指示搜索工具对网站进行爬取、生成反向索引并保存结果。索引存到文件系统中。为了简单起见,将整个索引保存在一个文件中。
- $load$
- 此命令从文件系统加载索引。显然,只有在此前已经使用$build$命令创建过索引的情况下,这个命令才能正常工作。
- $print$
- 此命令打印一个单词的反向索引信息,例如:
- $print\quad Peso$
- $find$
- 此命令用于在反向索引中查找某个查询短语,并返回包含此短语的所有页面的列表,例如:
- $find \quad Dinar$,返回所有包含 Dinar 的页面;
- $find \quad Area \quad Afghanistan$,返回所有同时包含这两个单词的网页列表。
3️⃣ 具体程序
import time
import re
import os
import json
import requests
import pandas as pd
from bs4 import BeautifulSoup
from alive_progress import alive_bar
# Base URL of the paginated index; build() appends the page number to it.
url_index_prefix = "http://example.webscraping.com/places/default/index/"
# Site root; build() prepends it to the relative detail-page paths it scrapes.
url_info_prefix = "http://example.webscraping.com"
# In-memory inverted index used by the interactive commands.
# Shape: {word: {page_url: occurrence_count}}.
inverted_index_dict = {}
# Total handed to the alive_bar progress bar in build()
# (presumably the number of detail pages on the site -- TODO confirm).
length = 252
# HTTP headers sent with every request; build() overwrites 'Referer' before
# each request.
# NOTE(review): the Cookie is a hard-coded session value, presumably captured
# from a browser session -- it may need refreshing if the site rejects it.
header = {
'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
'Accept-Encoding': "gzip, deflate",
'Accept-Language': "zh-CN,zh;q=0.9,en;q=0.8",
'Cache-Control': "max-age=0",
'Cookie': "session_id_places=True; session_data_places=\"586ad5c755d830e432c6e80f8b9a822a:xLrqTGkuTTFaRdOtQTpde-UcgSMy7nwOrXyEeyRafNjWT8t7J\
bHjZGf1cYO6bcnIVhwOHVNpJiMnr32rtSCF2_RSOUfBX4gRmU09KTNfMczD2vc4aaloPAvNE6gLStboj-EBBnFkWhVP3uCd8woSyXnTQwYi39HKoujz4iX1tJA5O4dr7z3VCn22mvev_\
MZaNSW4TT1jTJUZoF_3hyqtoN8rTL_Mjpu02ACJscaG6lRfQmIOBZ-BloR7aT4s-it19e0JYkbpynKb-an8f72IRhiN-thhyXeYbo6SCX0LzAra6Il1zM4Zpw9GkQFU2yha",
'Host': "example.webscraping.com",
'Proxy-Connection': "keep-alive",
'Referer': "http://example.webscraping.com/places/default/index/1",
'Upgrade-Insecure-Requests': "1",
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
}
def SplitList2Words(WordList):
    """Break a list of phrases into individual index tokens.

    Each item is converted with ``str()`` and split on the first separator
    kind it contains, checked in this order: space, then ".", then ",".
    Items containing none of them are kept whole.

    Args:
        WordList: iterable of phrases (any objects; ``str()`` is applied).

    Returns:
        A flat list of non-empty string tokens, in input order.
    """
    words = []
    for phrase in WordList:
        phrase = str(phrase)
        if " " in phrase:
            # A bare '>' token is noise, not a word worth indexing.
            pieces = [piece for piece in phrase.split(" ") if piece != '>']
        elif "." in phrase:
            # Keep every dot-separated segment; taking only segment [1]
            # silently dropped the rest (e.g. "1.5" was indexed as "5").
            pieces = phrase.split(".")
        elif "," in phrase:
            pieces = phrase.split(",")
        else:
            pieces = [phrase]
        # Leading/trailing/double separators and empty link texts produce
        # '' pieces; indexing the empty string is never useful.
        words.extend(piece for piece in pieces if piece)
    return words
def GenerateInvertedIndex(words, inverted_index_dict, url):
    """Record the occurrences of *words* on *url* and persist the index.

    The index maps each word to a dict of ``{page_url: occurrence_count}``.
    After updating, the whole index is serialized as JSON to
    'InvertedIndex.txt' in the current working directory, overwriting any
    previous content.

    Args:
        words: iterable of tokens found on the page.
        inverted_index_dict: the index to update (mutated in place).
        url: URL of the page the tokens came from.

    Returns:
        The same ``inverted_index_dict`` object, updated.
    """
    for w in words:
        # First sighting of a word creates its postings dict; first sighting
        # of a url for that word starts its counter at 1.
        postings = inverted_index_dict.setdefault(w, {})
        postings[url] = postings.get(url, 0) + 1
    # The context manager closes the file even if serialization fails.
    with open('InvertedIndex.txt', 'w', encoding='utf-8') as index_file:
        json.dump(inverted_index_dict, index_file)
    return inverted_index_dict
def _fetch_with_retry(url):
    """GET *url* with the shared ``header`` dict, retrying until it succeeds.

    Only request-level failures (timeouts, connection errors, ...) trigger a
    retry, so Ctrl-C and programming errors are no longer swallowed by the
    retry loop.
    """
    while True:
        try:
            # Set timeout to 10 seconds
            return requests.get(url, headers=header, timeout=10)
        except requests.exceptions.RequestException:
            print("TimeOut Error, reconnecting...")
            time.sleep(2)


def build():
    """Crawl the site and build the inverted index.

    For each of the 26 index pages (0-25) the link texts and the <h1> text
    are indexed under the index-page URL. Every detail-page link found on the
    index page is then fetched, and its link texts, <h1> text and the two
    columns of its first HTML table are indexed under the detail-page URL.
    The index is written to 'InvertedIndex.txt' by GenerateInvertedIndex
    after every update. The function sleeps between requests.

    Returns:
        The finished index, ``{word: {page_url: occurrence_count}}``.
    """
    # Local import: read_html is fed a file-like object because passing the
    # literal HTML string is deprecated in recent pandas releases.
    from io import StringIO

    inverted_index_dict = {}
    with alive_bar(length) as bar:
        for page in range(26):
            index_url = url_index_prefix + str(page)
            header['Referer'] = url_index_prefix
            r = _fetch_with_retry(index_url)

            # Index the index page itself: all link texts plus the heading.
            soup_main = BeautifulSoup(r.text, 'lxml')
            main_word_list = [str(w.text).strip() for w in soup_main.find_all('a')]
            main_word_list.append(str(soup_main.h1.text).strip())
            words = SplitList2Words(main_word_list)
            inverted_index_dict = GenerateInvertedIndex(
                words, inverted_index_dict, index_url)

            # Relative paths of the detail pages linked from this index page.
            url_suffix = re.findall(r'/places/default/view/[A-Za-z]+\S+[A-Za-z]+[-]+[0-9]+', r.text)
            print("Start to crawl page %d !" % (page))
            time.sleep(2)

            for info_url in url_suffix:
                bar()
                crawl_url = url_info_prefix + info_url
                header['Referer'] = index_url
                r_info = _fetch_with_retry(crawl_url)

                # Link texts and heading of the detail page.
                soup_info = BeautifulSoup(r_info.text, 'lxml')
                info_word_list = [str(w.text).strip() for w in soup_info.find_all('a')]
                info_word_list.append(str(soup_info.h1.text).strip())
                title_info = SplitList2Words(info_word_list)
                inverted_index_dict = GenerateInvertedIndex(
                    title_info, inverted_index_dict, crawl_url)

                # First table on the page: column 0 holds "Label:" cells,
                # column 1 the corresponding values.
                country = pd.read_html(StringIO(r_info.text))
                country_title = [x.split(":")[0] for x in country[0][0]]
                country_info = [info for info in country[0][1]]
                inverted_index_dict = GenerateInvertedIndex(
                    country_title, inverted_index_dict, crawl_url)
                country_info = SplitList2Words(country_info)
                inverted_index_dict = GenerateInvertedIndex(
                    country_info, inverted_index_dict, crawl_url)

                # Row 4 of the value column is presumably the country name
                # -- TODO confirm against the page layout.
                print("Country \"%s\" was crawled!" % (country[0][1][4]))
                time.sleep(5)
            print("Finished crawling page %d" % (page))
    return inverted_index_dict
def load():
    """Load the inverted index from 'InvertedIndex.txt'.

    Returns:
        The index as a dict, ``{word: {page_url: occurrence_count}}``.

    Raises:
        FileNotFoundError: if 'InvertedIndex.txt' does not exist.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    # The context manager closes the file even when parsing fails.
    with open('InvertedIndex.txt', 'r', encoding='utf-8') as index_file:
        return json.load(index_file)
def _find_pages(index, words):
    """Return the URLs whose pages contain every word in *words*.

    URLs are returned in the order they appear in the first word's postings.
    Raises KeyError if any word is not in *index*; *words* must be non-empty.
    """
    first, *rest = words
    other_postings = [index[word] for word in rest]
    return [url for url in index[first]
            if all(url in postings for postings in other_postings)]


if __name__ == "__main__":
    # Interactive loop: one command per line -- build, load, print <word>,
    # find <word> [<word> ...]. Ends when stdin is closed.
    while True:
        try:
            command = input()
        except EOFError:
            break
        action, *args = command.split() or [""]

        if action == 'build' and not args:
            build()
            # build() persists the index to disk; reload it so that print and
            # find work right away without a separate 'load'.
            if os.path.exists('InvertedIndex.txt'):
                inverted_index_dict = load()
        elif action == 'load' and not args:
            if not os.path.exists('InvertedIndex.txt'):
                print("Can not find the Inverted Index File, please 'build' first!")
            else:
                inverted_index_dict = load()
                print("Load file 'InvertedIndex.txt' successfully!")
        elif action == 'print' and args:
            word = args[0]
            if word in inverted_index_dict:
                beautiful_format = json.dumps(inverted_index_dict[word], indent=4, ensure_ascii=False)
                print(beautiful_format)
            else:
                print("The index '%s' doesn't exist!" % (word))
        elif action == 'find' and args:
            query = " ".join(args)
            try:
                pages = _find_pages(inverted_index_dict, args)
            except KeyError:
                # At least one query word is not in the index.
                pages = []
            if pages:
                # Each matching page is printed exactly once.
                for page_url in pages:
                    print(page_url)
            elif len(args) == 1:
                print("The index '%s' doesn't exist!" % (query))
            else:
                print("The intersection of '%s' doesn't exist!" % (query))
        else:
            # Unknown command, or print/find without an argument.
            print("Please input the right command!")
评论区