# 根據指定漢語關鍵字獲取語料數據

import time

from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait



# Chrome argument that sets the user-agent string sent by the automated browser.
USER_AGENT_ARGUMENT = (
    'user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/99.0.4844.74 Safari/537.36"'
)

# Shared Chrome session used by get_content(); created once at import time.
option = ChromeOptions()
option.add_argument(USER_AGENT_ARGUMENT)
browser = webdriver.Chrome(options=option)
browser.maximize_window()  # maximize the browser window


def get_content(keyword):
    """Search the corpus site for *keyword* and save every result page.

    Loads the search page in the module-level ``browser``, submits
    *keyword*, then walks the result pages by clicking the "下一頁" link
    until no such link (or no result text) is found.  On each page the text
    of the result spans is grouped three spans at a time, each group is
    joined into one string and handed to ``save_keyword_info``.  If the
    very first page yields no result spans, the keyword is recorded through
    ``invalid_keyword``.

    Args:
        keyword: Search term typed into the site's keyword box.
    """
    url = 'http://corpus.zhonghuayuwen.org/ACindex.aspx'
    result_xpath = '//*[@id="PanellSResults"]/div/span[position()>3]'
    step = 3  # number of consecutive spans that make up one saved entry

    browser.get(url)
    # find_element(By..., ...) replaces the find_element_by_* helpers that
    # were removed in Selenium 4.3; it also works on Selenium 3.
    input_tag = browser.find_element(By.ID, 'TextBoxACkeywords')
    input_tag.send_keys(keyword)
    input_tag.send_keys(Keys.ENTER)

    # Actually wait (up to 5 s) for result spans; constructing WebDriverWait
    # without calling .until() does not wait at all.
    try:
        WebDriverWait(browser, 5).until(
            lambda driver: driver.find_elements(By.XPATH, result_xpath)
        )
    except TimeoutException:
        # No results showed up in time; the empty-result branch below
        # handles this case.
        pass

    current_page = 0
    while True:
        try:
            html = etree.HTML(browser.page_source)
            spans = html.xpath(result_xpath)
            if not spans:
                # Only a keyword with no results at all is "invalid"; an
                # empty page after earlier hits just ends the pagination.
                if current_page == 0:
                    print('檢索不到:《{}》關鍵字的語料信息。\n'.format(keyword))
                    invalid_keyword(keyword)
                break

            current_page += 1
            print('\n------------------------------當前關鍵字:《{}》,當前頁碼:{}------------------------------。'.format(keyword,current_page))

            texts = [''.join(span.xpath('.//text()')) for span in spans]
            for start in range(0, len(texts), step):
                save_keyword_info(''.join(texts[start:start + step]))

            time.sleep(5)  # pause between pages before requesting the next one
            next_links = browser.find_elements(By.LINK_TEXT, '下一頁')
            if not next_links:
                break  # no "next page" link: last page reached
            next_links[0].click()
        except Exception as exc:
            # Best effort: stop paging for this keyword but report the error
            # instead of hiding it.  KeyboardInterrupt/SystemExit are not
            # caught, so the run can still be aborted.
            print('處理關鍵字:《{}》時發生錯誤,已停止翻頁:{!r}'.format(keyword, exc))
            break


def invalid_keyword(keyword):
    """Record a keyword that produced no search results.

    Appends *keyword* followed by a newline to ``invalid_data.txt`` in the
    current working directory.

    Args:
        keyword: The search term to record.
    """
    # Explicit UTF-8: the keywords are CJK characters, and the platform
    # default codec may be unable to encode them.
    with open('invalid_data.txt', 'a', encoding='utf-8') as f:
        f.write(keyword + '\n')


def save_keyword_info(text_info):
    """Append one corpus entry to ``corpus_data_01.txt`` and echo it.

    Args:
        text_info: Text of the entry; written as a single UTF-8 line and
            then printed to standard output.
    """
    with open('corpus_data_01.txt', 'a+', encoding='utf-8') as corpus_file:
        corpus_file.writelines((text_info, '\n'))
    print(text_info)

def read_text():
    """Scrape the corpus for every keyword listed in ``生僻字++.txt``.

    Reads the keyword file (one keyword per line), strips surrounding
    whitespace from each line and calls ``get_content`` for every non-empty
    keyword, sleeping 8 seconds after each one.
    """
    # utf-8-sig reads plain UTF-8 as well, but drops a leading BOM that
    # would otherwise become part of the first keyword.
    with open('生僻字++.txt', 'r', encoding='utf-8-sig') as f:
        keywords = [line.strip() for line in f]
    for keyword in keywords:
        if not keyword:
            continue  # blank line: nothing to search for
        print('\n開始抓取關鍵字:《{}》。'.format(keyword))
        get_content(keyword)
        time.sleep(8)  # pause between keywords





if __name__ == '__main__':
    try:
        read_text()
    finally:
        # Always shut the browser session down, even after an error or
        # Ctrl-C, so no Chrome/chromedriver process is left running.
        browser.quit()

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼?請在上方評論欄輸入並且點擊發布。
相關文章