爬取150708個英語單詞的例句

還是爬取必應在線詞典的內容。先放一段代碼和兩張圖片,改天填坑。

import urllib.request
import re
from bs4 import BeautifulSoup

# Regex matching URL-like strings: optional scheme (http/https/ftp),
# a hostname or dotted-quad IP, an optional :port and an optional path.
# Used in grab() to reject scraped sentences that contain links.
myRule5 = r'((http|ftp|https)://)?(([a-zA-Z0-9\._-]+\.[a-zA-Z]{2,6})|([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}))(:[0-9]{1,4})*(/[a-zA-Z0-9\&%_\./-~-]*)?'
compile_name5 = re.compile(myRule5, re.M)
# Regex matching any text ending in a ".com" or ".cn" domain suffix;
# a second, coarser link filter applied alongside myRule5.
myRule6 = r'.*?\.com|.*?\.cn'
compile_name6 = re.compile(myRule6, re.M)


def readData(dataPath, fileName):
    """Read the word-list file and return its raw lines (newlines kept).

    Args:
        dataPath: directory containing the file (expected to end with a
            path separator, since it is concatenated with fileName).
        fileName: name of the UTF-8 text file, one word per line.

    Returns:
        list[str]: the file's lines, each typically ending with '\\n'.
    """
    # Context manager guarantees the handle is closed even on error;
    # the original opened the file and never closed it.
    with open(dataPath + fileName, 'r', encoding='UTF-8') as file:
        return file.readlines()

def grab(dataPath, fileName, resultPath, style):
    """Scrape Bing Dictionary example sentences for every word in a list.

    For each word read from ``dataPath + fileName``, fetches the Bing
    dictionary result page, extracts all ``*.b_regtxt`` text nodes,
    filters them (length, punctuation ending, no embedded URLs, dedup)
    and writes the survivors to ``resultPath + <word>.txt``.

    Args:
        dataPath: directory of the input word list.
        fileName: word-list file name, one word per line.
        resultPath: directory where per-word result files are written.
        style: BeautifulSoup parser name (e.g. 'lxml').
    """
    # Hoisted: the original re-imported this inside the innermost loop.
    from w3lib.html import remove_tags

    nameList = readData(dataPath, fileName)

    for word in nameList:
        term = str(word).replace("\n", "")
        url = "https://cn.bing.com/dict/search?q=" + term
        # Single request per word — the original opened the same URL
        # twice and discarded the first response entirely.
        resp = urllib.request.urlopen(url)
        text = resp.read()
        soup = BeautifulSoup(text, style)
        print("word = " + str(word))

        # All candidate sentence nodes on the Bing result page.
        candidates = soup.select('div > div > div > div > div > div > *.b_regtxt')

        delete_small_letter_acronym = True
        i = 0
        # Dedup list must persist across the whole page; the original
        # reset it on every iteration, making the membership tests no-ops.
        used = []
        # 'with' ensures the result file is closed even if a request
        # or parse step below raises.
        with open(resultPath + term + ".txt", mode='w', encoding='utf-8') as myList:
            for node in candidates:
                sentence = remove_tags(str(node))
                if not sentence:
                    # Guard: empty text would raise on sentence[-1] below.
                    continue

                has_url = re.findall(myRule5, str(sentence))
                has_domain = re.findall(myRule6, str(sentence))
                last = sentence[-1]

                # Branch A: lowercase-starting sentence, only emitted once
                # an uppercase-starting sentence has cleared the flag.
                if (last != ' ' and not last.isalnum()
                        and sentence[0].islower()
                        and not has_url and not has_domain):
                    if (len(sentence) >= 20
                            and not delete_small_letter_acronym
                            and sentence.capitalize() not in used):
                        print(sentence.capitalize() + "23", file=myList)
                        used.append(sentence.capitalize())
                        i += 1

                # Branch B: general filter — long enough, punctuation-
                # terminated, not a duplicate in either casing.
                if (last != ' ' and len(sentence) >= 20
                        and sentence[0] != ' ' and not last.isalnum()
                        and sentence not in used
                        and sentence.capitalize() not in used):
                    if sentence[0].islower() and sentence.capitalize() not in used:
                        print(sentence.capitalize() + "24", file=myList)
                        used.append(sentence.capitalize())
                    else:
                        print(sentence + "25", file=myList)
                        used.append(sentence)
                    delete_small_letter_acronym = False
                    i += 1

def is_Chinese(word):
    """Return True if *word* contains at least one CJK unified ideograph.

    Checks each character against the BMP CJK range U+4E00..U+9FFF;
    any() short-circuits on the first hit.
    """
    return any('\u4e00' <= char <= '\u9fff' for char in word)

if __name__ == '__main__':
    # Input: word list produced by an earlier crawl stage, one word per line.
    input_dir = "/home/non_alphabetical/"
    input_file = "result13.txt"
    # Output: one <word>.txt of example sentences per input word.
    output_dir = "/home/Word_Meanings/result_13/"
    parser_name = 'lxml'
    grab(input_dir, input_file, output_dir, parser_name)

 

圖1 16個進程同時爬取

 

 

圖2 16個爬蟲進程同時運行時服務器狀態

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章