# Scraping the Bing online dictionary again. Dropping the code (and two screenshots) here for now; a proper write-up will follow.
import re
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup
# URL-detection patterns, compiled once at module load and used by grab()
# to discard snippets that look like links rather than definitions.
#
# myRule5: absolute URLs or bare host names / IPv4 addresses, with an
# optional scheme, port and path component.
myRule5 = r'((http|ftp|https)://)?(([a-zA-Z0-9\._-]+\.[a-zA-Z]{2,6})|([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}))(:[0-9]{1,4})*(/[a-zA-Z0-9\&%_\./-~-]*)?'
# myRule6: any text ending in a .com or .cn domain suffix.
myRule6 = r'.*?\.com|.*?\.cn'

# Pre-compiled (multi-line) forms of the two patterns above.
compile_name5 = re.compile(myRule5, re.M)
compile_name6 = re.compile(myRule6, re.M)
def readData(dataPath, fileName):
    """Read the word list stored at ``dataPath + fileName``.

    Args:
        dataPath: directory prefix; callers pass it with a trailing
            separator, since it is joined by plain concatenation.
        fileName: name of the UTF-8 text file, one word per line.

    Returns:
        List of lines, each still carrying its trailing newline
        (callers strip the newline themselves).
    """
    # Context manager guarantees the handle is closed; the original
    # leaked the open file object.
    with open(dataPath + fileName, 'r', encoding='UTF-8') as fh:
        return fh.readlines()
def grab(dataPath, fileName, resultPath, style):
    """Fetch each word's Bing-dictionary page and write filtered
    definition snippets to ``resultPath/<word>.txt``.

    Args:
        dataPath:   directory holding the input word list.
        fileName:   word-list file name (one word per line).
        resultPath: directory for the per-word output files.
        style:      parser name passed to BeautifulSoup (e.g. 'lxml').
    """
    # Third-party tag stripper; hoisted out of the per-tag loop, where the
    # original re-ran the import on every iteration.
    from w3lib.html import remove_tags

    for word in readData(dataPath, fileName):
        term = str(word).replace("\n", "")
        # Quote the term so words containing spaces or special characters
        # still form a valid query URL (the original interpolated them raw).
        url = "https://cn.bing.com/dict/search?q=" + urllib.parse.quote_plus(term)
        # One request per word; the original opened the URL twice, leaked
        # the first response, and set a meaningless .encoding attribute on it.
        with urllib.request.urlopen(url) as resp:
            # Read the raw page bytes; BeautifulSoup handles the decoding.
            page = resp.read()
        soup = BeautifulSoup(page, style)
        print("word = " + str(word))

        with open(resultPath + term + ".txt", mode='w', encoding='utf-8') as out:
            candidates = soup.select('div > div > div > div > div > div > *.b_regtxt')
            delete_small_letter_acronym = True
            # Snippets already written for this word. Shared across the whole
            # tag loop so duplicates are suppressed; the original reset this
            # list on every iteration, which made all the dedup checks no-ops.
            used = []
            for tag in candidates:
                snippet = remove_tags(str(tag))
                # Guard: an empty snippet would crash the index below.
                if not snippet:
                    continue
                # Skip anything that looks like a URL or bare domain name.
                has_url = (re.findall(myRule5, snippet)
                           or re.findall(myRule6, snippet))
                last = snippet[len(snippet) - 1]
                if (last != ' ' and not last.isalnum()
                        and snippet[0].islower() and not has_url):
                    if (len(snippet) >= 20
                            and delete_small_letter_acronym is False
                            and snippet.capitalize() not in used):
                        # The "23"/"24"/"25" suffixes are debug markers that
                        # tag which branch emitted the line; preserved from
                        # the original output format.
                        print(snippet.capitalize() + "23", file=out)
                        used.append(snippet.capitalize())
                if (last != ' ' and len(snippet) >= 20 and snippet[0] != ' '
                        and not last.isalnum()
                        and snippet not in used
                        and snippet.capitalize() not in used):
                    if snippet[0].islower() and snippet.capitalize() not in used:
                        print(snippet.capitalize() + "24", file=out)
                        used.append(snippet.capitalize())
                    else:
                        print(snippet + "25", file=out)
                        used.append(snippet)
                    # After the first accepted long snippet, stop treating
                    # lowercase-initial snippets as acronyms to delete.
                    delete_small_letter_acronym = False
def is_Chinese(word):
    """Return True if any character of *word* lies in the CJK Unified
    Ideographs block (U+4E00..U+9FFF), else False."""
    return any('\u4e00' <= ch <= '\u9fff' for ch in word)
if __name__ == '__main__':
    # Input word list and destination directory for the scraped definitions.
    word_list_dir = "/home/non_alphabetical/"
    word_list_file = "result13.txt"
    output_dir = "/home/Word_Meanings/result_13/"
    # Parser backend handed through to BeautifulSoup.
    parser = 'lxml'
    grab(word_list_dir, word_list_file, output_dir, parser)