Python: scraping second-hand house sale listings for Zhengzhou from Anjuke (安居客)

Python version: 3.7. Functionality: scrape second-hand house sale listings across Zhengzhou's districts from Anjuke. The code:

# -*- coding: utf-8 -*-
"""
@site: http://www.wangxiaofeng.site
"""
import urllib3
urllib3.disable_warnings()
import sqlite3
import random
import threading
from bs4 import BeautifulSoup

# Some User Agents
hds = [
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},
    {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0'},
    {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/44.0.2403.89 Chrome/44.0.2403.89 Safari/537.36'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'},
    {'User-Agent': 'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11'},
    {'User-Agent': 'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'},
]


lock = threading.Lock()  # module-level lock (unused; SQLiteWraper keeps its own RLock)


class SQLiteWraper(object):
    """
    A thin wrapper around the database to better handle multi-threaded writes.
    """

    def __init__(self, path, command='', *args, **kwargs):
        self.lock = threading.RLock()  # lock serializing database writes
        self.path = path  # path to the SQLite database file

        if command != '':
            conn = self.get_conn()
            cu = conn.cursor()
            cu.execute(command)
            conn.commit()
            conn.close()

    def get_conn(self):
        conn = sqlite3.connect(self.path)  # could also pass check_same_thread=False
        conn.text_factory = str
        return conn

    def conn_close(self, conn=None):
        if conn is not None:
            conn.close()

    def conn_trans(func):
        # Decorator: acquire the lock, open a connection, pass it to the
        # wrapped method via kwargs['conn'], then close and release.
        def connection(self, *args, **kwargs):
            self.lock.acquire()
            conn = self.get_conn()
            kwargs['conn'] = conn
            rs = func(self, *args, **kwargs)
            self.conn_close(conn)
            self.lock.release()
            return rs

        return connection

    @conn_trans
    def execute(self, command, method_flag=0, conn=None):
        cu = conn.cursor()
        try:
            if not method_flag:
                cu.execute(command)
            else:
                cu.execute(command[0], command[1])
            conn.commit()
        except sqlite3.IntegrityError as e:
            # print(e)
            return -1
        except Exception as e:
            print(e)
            return -2
        return 0
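
# Usage sketch (illustrative; the table name here is hypothetical): every write
# goes through SQLiteWraper.execute(), which serializes access across threads:
#   db = SQLiteWraper('test.db', 'create table if not exists t (a TEXT)')
#   db.execute(("insert into t values(?)", ('x',)), 1)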


def gen_ershoufang_insert_command(info_dict):
    """
    Generate the database insert command for one listing.
    """
    info_list = [u'小區名稱', u'房屋戶型', u'單價', u'位置', u'面積', u'首付', u'年代', u'朝向', u'月供', u'房屋類型', u'樓層', u'裝修程度', u'產權年限',
                 u'電梯', u'房本年限', u'產權性質', u'唯一住房']
    t = []
    for il in info_list:
        if il in info_dict:
            t.append(info_dict[il])
        else:
            t.append('')
    t = tuple(t)
    commands = (r"insert into anjuhouse values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)", t)
    # commands = (r"insert into anjuhouse values(?,?)", t)
    return commands
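
# The returned pair is the parameterized SQL plus a 17-tuple of field values,
# with any field missing from info_dict stored as an empty string, e.g.
# (values illustrative):
#   ("insert into anjuhouse values(?,?,...,?)", (u'某某小區', u'3室2廳1衛', ...))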


def ershoufang_spider(db_ershoufang, url_page):
    """
    Scrape the second-hand house details from one listing page.
    """
    try:
        http = urllib3.PoolManager()
        req = http.request('GET', url_page, headers=hds[random.randint(0, len(hds) - 1)])
        plain_text = req.data.decode('utf-8')
        soup = BeautifulSoup(plain_text, "html.parser")
        # The detail page lists the fields in a fixed order matching info_list
        cj_list = soup.findAll('div', {'class': 'houseInfo-content'})
        field_names = [u'小區名稱', u'房屋戶型', u'單價', u'位置', u'面積', u'首付', u'年代', u'朝向',
                       u'月供', u'房屋類型', u'樓層', u'裝修程度', u'產權年限', u'電梯', u'房本年限',
                       u'產權性質', u'唯一住房']
        info_dict = {}
        for name, cell in zip(field_names, cj_list):
            # Strip spaces, tabs and newlines from each field value
            info_dict[name] = cell.get_text().replace(" ", "").replace("\t", "").replace("\n", "").strip()
        commands = gen_ershoufang_insert_command(info_dict)
        db_ershoufang.execute(commands, 1)
    except (urllib3.exceptions.HTTPError, urllib3.exceptions.NewConnectionError) as e:
        print(e)
        exit(-1)
    except Exception as e:
        print(e)
        exit(-2)

def db_ershoufang_spider(db_ershoufang, page=1):
    """
    Collect the listing links on one result page and scrape each in its own thread.
    """
    url = u"https://zhengzhou.anjuke.com/sale/p%d/" % page
    try:
        http = urllib3.PoolManager()
        req = http.request('GET', url, headers=hds[random.randint(0, len(hds) - 1)])
        plain_text = req.data.decode('utf-8')
        soup = BeautifulSoup(plain_text, "html.parser")
        links = [a.get('href') for a in soup.findAll('a', {'class': 'houseListTitle'})]
        print(links)
        threads = [threading.Thread(target=ershoufang_spider, args=(db_ershoufang, viewurl))
                   for viewurl in links]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
    except (urllib3.exceptions.HTTPError, urllib3.exceptions.NewConnectionError) as e:
        print(e)
        exit(-3)
    except Exception as e:
        print(e)
        exit(-4)

if __name__ == "__main__":
    command = "create table if not exists anjuhouse (xiaoqu TEXT, huxing TEXT, danjia TEXT, weizhi TEXT, mianji TEXT, shoufu TEXT, niandai TEXT, chaoxiang TEXT, yuegong TEXT, leixing TEXT, louceng TEXT, zhuangxiu TEXT, chanquan TEXT, dianti TEXT, nianxian TEXT, xingzhi TEXT, weiyi TEXT)"
    # command = "create table if not exists anjuhouse (xiaoqu TEXT, huxing TEXT)"
    db_ershoufang = SQLiteWraper('anjuke-ershoufang.db', command)

    # Crawl result pages 1 through 50
    for page in range(1, 51):
        db_ershoufang_spider(db_ershoufang, page)
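
Once the crawl finishes (or while it runs), the collected rows can be checked directly from the SQLite file. A minimal sketch for inspecting the database; the queries here are illustrative:

import sqlite3

conn = sqlite3.connect('anjuke-ershoufang.db')
cur = conn.cursor()
cur.execute("select count(*) from anjuhouse")
print(cur.fetchone()[0])  # number of listings stored so far
for row in cur.execute("select xiaoqu, danjia, mianji from anjuhouse limit 5"):
    print(row)  # community name, unit price, area
conn.close()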

Results

Copyright notice: the content and code in this article are for personal study only; do not use them for commercial purposes. The author accepts no liability for losses caused to others by improper use, or for any legal disputes that may arise from it.
