Web Scraping Summary (Personal Notes)


Methods:
1. from urllib import request

1. Accessing a website
# target url
url = '...'
# build the request (url, headers, cookies, etc.)
req = request.Request(url)
# add a header
#req.add_header('User-Agent', "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36")
# open the url
html = request.urlopen(req)
# print the url actually visited (after redirects)
print(html.geturl())
# print the headers
print(html.info())
# print the status code
print(html.getcode())
# decode the body
html = html.read().decode()
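
A minimal runnable sketch of the above, using http://httpbin.org/get as a stand-in target (not a URL from the original notes):
from urllib import request

url = 'http://httpbin.org/get'
req = request.Request(url)
req.add_header('User-Agent', 'Mozilla/5.0')
rsp = request.urlopen(req)
print(rsp.geturl())    # final url after redirects
print(rsp.getcode())   # status code
print(rsp.read().decode('utf-8')[:200])   # first 200 characters of the body
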
2. Cookies
from http import cookiejar
cookie = cookiejar.MozillaCookieJar(filename)
cookie_handler = request.HTTPCookieProcessor(cookie)
http_handler = request.HTTPHandler()
https_handler = request.HTTPSHandler()
# assemble the opener
opener = request.build_opener(http_handler, https_handler, cookie_handler)
# make the request through the opener
rsp = opener.open(req)
# save the cookies to the file
cookie.save()
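
A complete sketch of the cookie flow, assuming a stand-in URL and a cookies.txt file name of my choosing:
from urllib import request
from http import cookiejar

cookie = cookiejar.MozillaCookieJar('cookies.txt')        # file the cookies are saved to
cookie_handler = request.HTTPCookieProcessor(cookie)
opener = request.build_opener(cookie_handler)
rsp = opener.open('http://httpbin.org/cookies/set?name=value')   # stand-in URL
cookie.save(ignore_discard=True, ignore_expires=True)    # also keep session cookies
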
3. parse (form data)
from urllib import parse
data = {
    # form field names and values go here
}
data = parse.urlencode(data)
# mind the encoding: data must be passed as bytes
req = request.Request(url, data=data.encode("utf-8"))
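
A minimal POST sketch; the endpoint and form field below are placeholders, not values from the original notes:
from urllib import request, parse

url = 'http://httpbin.org/post'               # placeholder endpoint
data = parse.urlencode({'kw': 'python'})      # placeholder form field
req = request.Request(url, data=data.encode('utf-8'))
rsp = request.urlopen(req)
print(rsp.read().decode('utf-8'))
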
4. error
from urllib import error
try:
    ***
except error.URLError as e:
    ***
except Exception as e:
    ***
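
A runnable version of the pattern, using a deliberately unreachable host as the example:
from urllib import request, error

try:
    rsp = request.urlopen('http://no-such-host.invalid/', timeout=5)
    print(rsp.read().decode('utf-8'))
except error.HTTPError as e:     # HTTP errors (404, 500, ...), a subclass of URLError
    print('HTTP error:', e.code)
except error.URLError as e:      # DNS, connection and timeout errors
    print('URL error:', e.reason)
except Exception as e:
    print('other error:', e)
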
5. Proxies
proxy = {}
proxy['http'] = '119.179.130.59:8060'
proxy_handler = request.ProxyHandler(proxy)
opener = request.build_opener(proxy_handler)
request.install_opener(opener)
eg:v13
eg:02-多個代理
eg:05_daili

6. SSL
import ssl
# skip certificate verification globally for urllib
ssl._create_default_https_context = ssl._create_unverified_context

7. Saving an image
img_src = 'http://724.169pp.net/bizhi/2017/039/1.jpg'
request.urlretrieve(img_src, r'C:\Users\machenike\Desktop\1.jpg')
# download progress callback
def Schedule(blocknum, blocksize, totalsize):
    '''
    :param blocknum: number of blocks downloaded so far
    :param blocksize: size of one block
    :param totalsize: size of the remote file
    :return:
    '''
    per = 100.0 * blocknum * blocksize / totalsize
    if per > 100:
        per = 100
    print('Download progress: {}%'.format(int(per)))
# pass the progress callback as the third argument
request.urlretrieve(src, path + '/' + src.split('/')[-1], Schedule)

2. import requests
1. Basic access
url = "http://www.baidu.com"
# GET request
rsp = requests.get(url)
# print the body
print(rsp.text)
2. Proxies
proxy = {
    "http": "39.137.107.98:80"
}
# method, url, proxies
rsp = requests.request("get", url, proxies=proxy)
3. Query parameters (search)
kv = {'wd': 'Python'}
r = requests.get(url, params=kv)
4. SSL
# To skip SSL certificate verification with requests, set verify=False; the downside is that the log fills up with warnings.
pic = requests.get(pho[0], headers=headers, verify=False)
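
A short combined requests sketch; the URL, headers and parameters here are placeholders of mine:
import requests

url = 'https://httpbin.org/get'                 # placeholder URL
headers = {'User-Agent': 'Mozilla/5.0'}         # placeholder header
params = {'wd': 'Python'}
rsp = requests.get(url, headers=headers, params=params, timeout=10, verify=False)
print(rsp.status_code)
print(rsp.text[:200])
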

3. selenium
Reference: https://www.cnblogs.com/miqi1992/p/8093958.html
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# open Chrome
driver = webdriver.Chrome()
# open the url
driver.get("http://www.baidu.com")
# get the text of the element whose id is "wrapper"
text = driver.find_element_by_id("wrapper").text
# print the page title
print(driver.title)
# take a screenshot of the current page
driver.save_screenshot("長城.png")
# find the input box and type '大熊貓'
driver.find_element_by_id("kw").send_keys(u"大熊貓")
# click the search button
driver.find_element_by_id('su').click()
# get the cookies of the current page
print(driver.get_cookies())
# ctrl+a to select everything in the input box
driver.find_element_by_id('kw').send_keys(Keys.CONTROL, 'a')
# simulate pressing Enter
driver.find_element_by_id('su').send_keys(Keys.RETURN)
# clear the input box
driver.find_element_by_id('kw').clear()
# get the current url
print(driver.current_url)
# scroll down 10000 pixels
js = "var q=document.documentElement.scrollTop=10000"
driver.execute_script(js)
# quit
driver.quit()
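
Note: in Selenium 4 the find_element_by_* helpers are deprecated; the same calls are written with By locators, roughly like this:
from selenium.webdriver.common.by import By

driver.find_element(By.ID, "kw").send_keys(u"大熊貓")
driver.find_element(By.ID, "su").click()
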
4. The Scrapy framework
# create a scrapy project
In cmd: "scrapy startproject e11"
# create the spider inside the spiders folder
Created in the spiders folder:
class **Spider(scrapy.Spider):
    name = '**'
    allowed_domains = ['careers.tencent.com']
    start_urls = ['https://careers.tencent.com/search.html?&start=0#a']

    def parse(self, response):
        # what to run for each response:
        # fill the Item defined in items.py
        # yield the result

Define the Item that will hold the data in items.py:
class QQItem(scrapy.Item):
    name = scrapy.Field()
    detailLink = scrapy.Field()
    positionInfo = scrapy.Field()
    workLocation = scrapy.Field()

Handle the scraped data in pipelines.py:
class QQPipeline(object):
    def process_item(self, item, spider):
        with open('QQ.json', 'a') as f:
            json.dump(dict(item), f, ensure_ascii=False)
        return item  # required

Set the priority in settings.py:
# pipeline
ITEM_PIPELINES = {
    'e16_qq.pipelines.QQPipeline': 300,
}
# if middlewares are defined, this has to be set as well
DOWNLOADER_MIDDLEWARES = {
    'e17_xiaohua.middlewares.XiaohuaDownloaderMiddleware': 543,
}

Set how pages are fetched in middlewares.py:
class MeijuDownloaderMiddleware(object):
    def process_request(self, request, spider):
        driver = webdriver.Chrome()
        driver.get(request.url)
        time.sleep(1)
        html = driver.page_source
        driver.quit()
        # must return an HtmlResponse so Scrapy uses it instead of downloading again
        return HtmlResponse(url=request.url, body=html, encoding='utf-8', request=request)

Closing the spider from inside the spider:
self.crawler.engine.close_spider(self, "cookie expired, closing the spider")

Closing the spider from a pipeline or downloader middleware:
spider.crawler.engine.close_spider(spider, "finished, closing the spider")
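
A minimal self-contained spider following the skeleton above; the spider name, start URL and XPath expressions are placeholders of mine, not from the original project:
import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotes'                                   # placeholder name
    start_urls = ['http://quotes.toscrape.com/']      # placeholder start URL

    def parse(self, response):
        for quote in response.xpath('//div[@class="quote"]'):
            yield {
                'text': quote.xpath('./span[@class="text"]/text()').get(),
                'author': quote.xpath('.//small[@class="author"]/text()').get(),
            }
Run it with "scrapy crawl quotes -o quotes.json" after dropping the file into a project's spiders folder.
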

5. OCR (image-to-text recognition)
import pytesseract as pt
from PIL import Image
# image path
image = Image.open(“1-26.jpg”)
text = pt.image_to_string(image)
print(text)
6. MySQL
eg:18_mysqllianjiejianbiao
eg:19_mysqlcharu
eg:20_mysqlchaxun
eg:21_mysqlgengxin
eg:22_mysqldelete
# connect
db = pymysql.connect(host='192.168.43.245', user='shiboven', passwd='xqx521', db='mysql', port=3306)
# create a cursor with cursor(); all operations go through it
cursor = db.cursor()
# run SQL with execute()
cursor.execute('DROP TABLE IF EXISTS JBTLXY')
# create the table with a prepared statement
cursor.execute(sql)
db.close()
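
A fuller pymysql sketch with a parameterized insert; the connection details, table and column names are placeholders (note that writes need db.commit()):
import pymysql

db = pymysql.connect(host='127.0.0.1', user='root', passwd='secret', db='test', port=3306, charset='utf8mb4')
cursor = db.cursor()
cursor.execute('CREATE TABLE IF NOT EXISTS songs (id INT AUTO_INCREMENT PRIMARY KEY, name VARCHAR(255))')
# a parameterized insert avoids escaping values by hand
cursor.execute('INSERT INTO songs (name) VALUES (%s)', ('demo',))
db.commit()      # writes are not persisted without commit()
cursor.execute('SELECT * FROM songs')
print(cursor.fetchall())
db.close()
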
7. MongoDB
eg:hupu_mongoDB
eg:23_mongoconnect
eg:24_mongoinstall
# connect to MongoDB
client = pymongo.MongoClient()
# get the database
db = client.TBTL_tea
# get the collection
std = db.posts
# get the data
datas = std.find()
eg:04_kugou_mongoDB_bs4
Method 1
datas = []
data = {
    'href': href,
    'songer': songer,
    'song': song,
    'time': time,
    'rank': rank
}
datas.append(data)
client = pymongo.MongoClient()
songs = client.KG_DB.songs
songs_id = songs.insert_many(datas)  # insert the whole list, not a single dict
Method 2
paiming = []
mingzi = []
wangzhi = []
zuozhe = []
shijian = []
yuedu = []
pinglun = []
paiming.append(rank)
mingzi.append(title)
wangzhi.append(url)
zuozhe.append(Author)
shijian.append(Time)
yuedu.append(Comment)
pinglun.append(Reply)
items = zip(paiming, mingzi, wangzhi, zuozhe, shijian, yuedu, pinglun)
hupu_post = MongoAPI(db_name='new_hupu', table_name='post')
for item in items:
    # add() is defined in the hupu_mongoDB example
    hupu_post.add({
        'rank': item[0],
        'title': item[1],
        'url': item[2],
        'Author': item[3],
        'Time': item[4],
        'Reply': item[5],
        'Comment': item[6],
    })
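
A minimal standalone pymongo sketch; database and collection names are placeholders of mine:
import pymongo

client = pymongo.MongoClient()        # defaults to localhost:27017
collection = client.demo_db.songs     # placeholder database and collection
collection.insert_one({'song': 'demo', 'rank': 1})
for doc in collection.find({'rank': 1}):
    print(doc)
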

Tools:
1. re (regular expressions)
# find what is captured by the group
s = r'(.*?)'    # placeholder pattern: put the surrounding literal text around the group
pattern = re.compile(s, re.S)
films = pattern.findall(str)
m = pattern.match("one12two2three3", 3, 10)
# the whole string matched by the expression
print(m.group())
# start position of the match
print(m.start(0))
# end position of the match
print(m.end(0))
# (start, end) tuple of the match
print(m.span(0))
Reference: https://www.runoob.com/python3/python3-reg-expressions.html
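
A concrete sketch of the findall pattern; the HTML snippet and tag names are made up for illustration:
import re

html = '<li class="title">Film A</li><li class="title">Film B</li>'    # made-up sample
pattern = re.compile(r'<li class="title">(.*?)</li>', re.S)            # re.S lets . match newlines too
films = pattern.findall(html)
print(films)    # ['Film A', 'Film B']
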

2. XPath
print(book.xpath('.//div[@class="title"]/a[@href]')[0].attrib['href'])
# same element as above, but take its text instead
print(book.xpath('.//div[@class="title"]/a')[0].text)
men = response.xpath('//div[@class="item_list infinite_scroll"]/div')
for man in men:
    item['name'] = man.xpath('./div/div/a/img/@alt').extract()
    item['src'] = man.xpath('./div/div/a/img/@src').extract()
    item['href'] = man.xpath('./div/div/a/@href').extract()[0]
Reference: https://www.runoob.com/xpath/xpath-tutorial.html
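
A self-contained lxml sketch of the first two calls; the HTML sample is made up:
from lxml import etree

book = etree.HTML('<div class="title"><a href="/movie/1">Movie One</a></div>')    # made-up sample
print(book.xpath('.//div[@class="title"]/a[@href]')[0].attrib['href'])   # /movie/1
print(book.xpath('.//div[@class="title"]/a')[0].text)                    # Movie One
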

3. bs4 (BeautifulSoup)
Reference: https://cuiqingcai.com/1319.html
soup = BeautifulSoup(html, 'lxml')
# soup.prettify() --> pretty-prints the parsed document
# html = soup.prettify()
divs = soup.select("li[class='media']")
for div in divs:
    name = div.select('div h3 a')[0].get_text()
    href = div.select('div h3 a')[0].attrs['href']
# grab the content under a given class directly (here the rank number, e.g. "1")
rank = soup.select(".pc_temp_num")
soup = BeautifulSoup(res.text, 'lxml')
items = soup.find('ul', {'class': 'f-hide'}).find_all('a')
# inside the loop over items
id = item.get('href')
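
A self-contained sketch of the select() pattern; the HTML sample is made up:
from bs4 import BeautifulSoup

html = '<li class="media"><div><h3><a href="/post/1">First post</a></h3></div></li>'    # made-up sample
soup = BeautifulSoup(html, 'lxml')
for li in soup.select("li[class='media']"):
    name = li.select('div h3 a')[0].get_text()
    href = li.select('div h3 a')[0].attrs['href']
    print(name, href)    # First post /post/1
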

4. Storing data with pandas
eg:03_pandascuncsv
datas = pd.DataFrame({
    'name': names,
    'id': ids
})
datas.to_csv('movids.csv')
# print the first five rows
print(datas.head())
# count rows per type and the mean boxoffice, then sort
# print(datas.groupby('type').agg({'boxoffice': ['count', 'mean']}))
print(pd.read_csv('kuwo.csv', encoding='gbk').head())

5. tkinter
eg:07_tkinter_wangyiyun
# create the player window
root = Tk()
# title
root.title('網易雲音樂下載器')
# set the window size and position
root.geometry("700x550")
root.geometry("+700+80")
# label: "please enter the download address"
lable = Label(root, text="請輸入您下載的地址:", font=('隸書', 22))
# positioning: pack / place / grid
lable.grid()
# input box
entry = Entry(root, font=('隸書', 22), width=25)
entry.grid(row=0, column=1)
# list box
text = Listbox(root, font=('隸書', 22), width=46, height=14)
text.grid(row=1, columnspan=2)
# buttons (sticky takes N/S/W/E)
button1 = Button(root, text="開始", font=('微軟雅黑', 25), command=music_spider)
button1.grid(row=2, column=0, sticky='s')  # sticky = alignment
# quit button
button2 = Button(root, text="退出", font=('微軟雅黑', 25), command=root.quit)
button2.grid(row=2, column=1, sticky='s')  # sticky = alignment
# show the window and enter the event loop
root.mainloop()
def music_spider():
    # append an entry to the list box
    text.insert(END, '下載完成:{}'.format(name))
    # scroll the list box to the bottom
    text.see(END)
    # refresh the widget
    text.update()

Notes: a few small points
1. decode() decodes (bytes -> str)
   encode() encodes (str -> bytes)

2. json
# parse a JSON string into Python data
json_data = json.loads(data)
# parse the JSON body of a response
print(json.loads(req.text))
print(req.json())
dumps turns a dict into a str; loads turns a str into a dict.
dump and load do the same, but read from and write to files.
# loads vs dumps explained: https://www.cnblogs.com/wswang/p/5411826.html
eg:15_jsoncunchu
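
A quick sketch of all four calls; the file name is a placeholder:
import json

d = {'name': 'Python', 'rank': 1}
s = json.dumps(d, ensure_ascii=False)      # dict -> str
print(json.loads(s))                       # str -> dict
with open('demo.json', 'w') as f:          # placeholder file name
    json.dump(d, f, ensure_ascii=False)    # dict -> file
with open('demo.json') as f:
    print(json.load(f))                    # file -> dict
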

3. md5
import hashlib
md5 = hashlib.md5()
md5.update('***'.encode("utf-8"))
sign = md5.hexdigest()
print(sign)

4. random
# opener_list is a list; choice() picks one element at random
random.choice(opener_list)

5. os
path = '圖片/'
if not os.path.exists(path):
    # mkdir creates a single directory; makedirs creates every directory in the path
    os.mkdir(path)
with open('{}{}.jpg'.format(path, name), 'wb') as f:
    f.write(pic.content)

6. csv
with open('dmbj.csv', 'w', newline='') as f:
    f_csv = csv.writer(f)
    # write a single row (the header)
    f_csv.writerow(['書名', '章節名', '時間', '網址'])
    # write many rows at once
    f_csv.writerows(contents)
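
Reading the file back, as a quick sketch:
import csv

with open('dmbj.csv', newline='') as f:
    for row in csv.reader(f):
        print(row)    # each row comes back as a list of strings
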

7. zip
a = ['a', 'b', 'c']
b = [1, 2, 3]
x = dict(zip(a, b))
# x == {'a': 1, 'b': 2, 'c': 3}
