"""Scrape corpus search results for a list of keywords with Selenium.

For every keyword read from the keyword file, the script submits a search
on the corpus site, walks through the result pages, and appends the
extracted text to a local file. Keywords whose first result page is empty
are recorded in a separate file.
"""
import time

from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

option = ChromeOptions()
# NOTE(review): the double quotes are part of the argument value, so they are
# presumably sent as part of the User-Agent string -- confirm this is intended.
option.add_argument(
    'user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36"'
)
# The browser is started at import time and shared by all functions below.
browser = webdriver.Chrome(options=option)
browser.maximize_window()  # maximize the browser window


def _wait_until_stale(element, timeout=5):
    """Block until *element* is detached from the DOM or *timeout* elapses.

    Used after an action that is expected to reload the page. A timeout is
    not treated as an error: the caller simply continues with whatever the
    browser currently shows.
    """
    try:
        WebDriverWait(browser, timeout).until(EC.staleness_of(element))
    except TimeoutException:
        # The page may have been updated without replacing the element.
        pass


def _extract_rows(page_source, step=3):
    """Return the result texts found in *page_source*.

    The text of every matching ``span`` is collected, then consecutive
    groups of *step* spans are concatenated into one string each. An empty
    list is returned when nothing matches.
    """
    html = etree.HTML(page_source)
    if html is None:  # lxml returns None for an empty document
        return []
    spans = html.xpath('//*[@id="PanellSResults"]/div/span[position()>3]')
    texts = [''.join(span.xpath('.//text()')) for span in spans]
    return [''.join(texts[k:k + step]) for k in range(0, len(texts), step)]


def get_content(keyword):
    """Search the corpus for *keyword* and save every page of results.

    Each extracted result is passed to ``save_keyword_info``. If the first
    result page contains nothing, the keyword is passed to
    ``invalid_keyword``. Paging stops when the next-page link is missing,
    when a page yields no results, or when an error occurs; errors are
    printed rather than raised so that the remaining keywords can still be
    processed.
    """
    url = 'http://corpus.zhonghuayuwen.org/ACindex.aspx'
    browser.get(url)
    input_tag = browser.find_element(By.ID, 'TextBoxACkeywords')  # search box
    input_tag.send_keys(keyword)  # type the keyword
    input_tag.send_keys(Keys.ENTER)  # submit with Enter
    _wait_until_stale(input_tag)

    current_page = 0
    while True:
        try:
            rows = _extract_rows(browser.page_source)
            if not rows:
                # Only a keyword with an empty *first* page has no results;
                # an empty later page just means paging is finished.
                if current_page == 0:
                    print('檢索不到:《{}》關鍵字的語料信息。\n'.format(keyword))
                    invalid_keyword(keyword)
                break

            current_page += 1
            print('\n------------------------------當前關鍵字:《{}》,當前頁碼:{}------------------------------。'.format(keyword,current_page))
            for text_info in rows:
                save_keyword_info(text_info)

            time.sleep(5)  # pause between page requests
            try:
                next_button = browser.find_element(By.LINK_TEXT, '下一頁')
            except NoSuchElementException:
                break  # no next-page link: last page reached
            next_button.click()
            _wait_until_stale(next_button)
        except Exception as exc:
            # Per-keyword boundary: report the problem and give up on this
            # keyword. KeyboardInterrupt/SystemExit are not caught here.
            print('抓取關鍵字:《{}》時發生錯誤,已停止該關鍵字:{!r}'.format(keyword, exc))
            break


def invalid_keyword(keyword):
    """Append *keyword* to the file listing keywords without results."""
    # Explicit UTF-8 so that rare characters can be written on any locale.
    with open('invalid_data.txt', 'a', encoding='utf-8') as f:
        f.write(keyword + '\n')


def save_keyword_info(text_info):
    """Append one result text to the corpus data file and echo it."""
    with open('corpus_data_01.txt', 'a', encoding='utf-8') as f:
        f.write(text_info + '\n')
    print(text_info)


def read_text():
    """Read the keyword file and scrape results for each keyword in turn."""
    # 'utf-8-sig' reads plain UTF-8 too, and drops a leading BOM if present.
    with open('生僻字++.txt', 'r', encoding='utf-8-sig') as f:
        data_lists = f.readlines()
    for line in data_lists:
        keyword = line.strip()
        if not keyword:
            continue  # skip blank lines instead of searching for ''
        print('\n開始抓取關鍵字:《{}》。'.format(keyword))
        get_content(keyword)
        time.sleep(8)  # pause between keywords


if __name__ == '__main__':
    try:
        read_text()
    finally:
        browser.quit()  # always shut the browser and driver down