import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
import pymysql
# Scrape LEI records from leichina.org with Selenium and store them in MySQL.
#
# Flow: open the query page, submit a search (the captcha is typed in by the
# operator on the console), then walk every result page, open each row's
# detail view to read the English company name, and insert one row per record
# into the `lei` table.

# Path to the local ChromeDriver executable.
webdriver_path = 'C:/chromedriver/chromedriver.exe'

# CSS selectors. The element ids contain ':' which must be escaped as '\:' in
# CSS, so these are raw strings: in a normal literal '\:' is an invalid escape
# sequence and triggers a DeprecationWarning/SyntaxWarning.
CAPTCHA_INPUT_SELECTOR = r"#searchForm\:vCode"
RESULT_TABLE_SELECTOR = r"#resultForm\:cdmLegalEntityListData"
RESULT_BODY_SELECTOR = r"#resultForm\:cdmLegalEntityListData\:tb"
ENGLISH_NAME_SELECTOR = r"#j_id3\:j_id6 > table > tbody > tr > td > table > tbody > tr:nth-child(2) > td.txtLeft"
BACK_BUTTON_SELECTOR = "#j_id3 > table > tbody > tr > td > table > tbody > tr:nth-child(1) > td > table > tbody > tr > td > table:nth-child(15) > tbody > tr:nth-child(2) > td > table > tbody > tr > td:nth-child(2) > a > img"
NEXT_PAGE_SELECTOR = '#resultForm > table > tbody > tr:nth-child(1) > td > table > tbody > tr > td > table:nth-child(2) > tbody > tr:nth-child(2) > td > a:nth-child(9)'

# Parameterized insert; the driver escapes the scraped values.
INSERT_SQL = "INSERT INTO lei (lei, company_cn_name, status, address, company_en_name) VALUES (%s, %s, %s, %s, %s)"


def _wait_for_css(driver, selector, timeout):
    """Wait up to `timeout` seconds for `selector` to be present; return the element."""
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, selector)))


def _open_search_results(driver):
    """Load the query page, submit the captcha-protected search, wait for results."""
    driver.get("https://www.leichina.org/cei/2935720/2935943/index.html")
    time.sleep(10)
    driver.maximize_window()
    WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "body")))

    # The search form lives inside an iframe; every later lookup happens there.
    driver.switch_to.frame(driver.find_element(By.ID, "frame2"))

    # Click the "more" button, then wait for the drop-down it is followed by.
    driver.find_element(By.ID, "moreBtn").click()
    dropdown = _wait_for_css(driver, "#btn2 > select", 10)
    # Picks the second option of the drop-down.
    # NOTE(review): the meaning of that option is not visible here - confirm.
    Select(dropdown).select_by_index(1)

    # Click the refresh icon to get a fresh captcha and wait for its image.
    driver.find_element(By.ID, "changepic").click()
    _wait_for_css(driver, "#imgcode", 10)

    # The captcha is solved by a human: read it from the console.
    user_input = input("請打開驗證碼圖片URL,在瀏覽器中查看驗證碼,並輸入驗證碼:")
    driver.find_element(By.CSS_SELECTOR, CAPTCHA_INPUT_SELECTOR).send_keys(user_input)

    # Submit the query and wait for the result table.
    driver.find_element(By.ID, "searchForm:j_id36").click()
    time.sleep(10)
    _wait_for_css(driver, RESULT_TABLE_SELECTOR, 30)


def _count_result_rows(driver):
    """Return the number of rows on the current result page (the last page may have fewer)."""
    body = _wait_for_css(driver, RESULT_BODY_SELECTOR, 30)
    return len(body.find_elements(By.TAG_NAME, "tr"))


def _scrape_row(driver, cursor, index):
    """Scrape result row `index`, including its detail view, and insert it.

    The table is looked up again on every call because opening the detail
    view and returning invalidates previously fetched elements.
    """
    body = _wait_for_css(driver, RESULT_BODY_SELECTOR, 30)
    cells = body.find_elements(By.TAG_NAME, "tr")[index].find_elements(By.TAG_NAME, "td")
    lei = cells[0].text
    company_cn_name = cells[1].text
    status = cells[2].text
    address = cells[3].text

    # Open the detail view through the link in the fifth column.
    cells[4].find_element(By.TAG_NAME, "a").click()
    time.sleep(5)
    # Wait for the English-name cell itself; waiting for <body> proves nothing
    # because a body element is always present.
    company_en_name = _wait_for_css(driver, ENGLISH_NAME_SELECTOR, 20).text

    cursor.execute(INSERT_SQL, (lei, company_cn_name, status, address, company_en_name))

    # Return to the result list.
    driver.find_element(By.CSS_SELECTOR, BACK_BUTTON_SELECTOR).click()
    time.sleep(5)


# Create the ChromeDriver service object.
service = Service(executable_path=webdriver_path)
# Connect to the database.
# NOTE(review): credentials are hard-coded; consider loading them from
# configuration or the environment.
db = pymysql.connect(host='127.0.0.1', user='root', password='******', db='lei_db', charset='utf8mb4')
# Create the cursor used for all inserts.
cursor = db.cursor()
try:
    # Start the browser.
    driver = webdriver.Chrome(service=service)
    try:
        _open_search_results(driver)

        # Number of result pages to walk (hard-coded).
        total_pages = 10692
        for page in range(1, total_pages + 1):
            print(f"正在處理第{page}頁...")
            time.sleep(10)
            # Wait for the result table of this page.
            _wait_for_css(driver, RESULT_TABLE_SELECTOR, 30)
            r_count = _count_result_rows(driver)
            for i in range(r_count):
                _scrape_row(driver, cursor, i)
                # Commit per record so finished rows survive a later failure.
                db.commit()
                print(f"已經爬取第{i+1}條...")
            # Go to the next page; there is none after the last page.
            if page < total_pages:
                driver.find_element(By.CSS_SELECTOR, NEXT_PAGE_SELECTOR).click()
                time.sleep(20)  # give the next page time to load
    finally:
        # Always shut the browser down, even when scraping fails midway.
        driver.quit()
finally:
    # Always release the database resources.
    cursor.close()
    db.close()
# selenium+mysql 爬取LEI官網數據
# 發表評論
# 所有評論
# 還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.