短租數據集分析

短租數據集分析

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif']=['SimHei'] #用來正常顯示中文標籤
plt.rcParams['axes.unicode_minus']=False #用來正常顯示負號
%matplotlib inline

查看listings,calendar,reviews,neighbourhoods各表的基本內容

# Listing summary table: listing, host, location, room type, price, review count, availability, etc.
listings=pd.read_csv('listings.csv')
listings.head()
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
id name host_id host_name neighbourhood_group neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
0 44054 Modern and Comfortable Living in CBD 192875 East Apartments NaN 朝陽區 / Chaoyang 39.89503 116.45163 Entire home/apt 792 1 89 2019-03-04 0.85 9 341
1 100213 The Great Wall Box Deluxe Suite A團園長城小院東院套房 527062 Joe NaN 密雲縣 / Miyun 40.68434 117.17231 Private room 1201 1 2 2017-10-08 0.10 4 0
2 128496 Heart of Beijing: House with View 2 467520 Cindy NaN 東城區 39.93213 116.42200 Entire home/apt 389 3 259 2019-02-05 2.70 1 93
3 161902 cozy studio in center of Beijing 707535 Robert NaN 東城區 39.93357 116.43577 Entire home/apt 376 1 26 2016-12-03 0.28 5 290
4 162144 nice studio near subway, sleep 4 707535 Robert NaN 朝陽區 / Chaoyang 39.93668 116.43798 Entire home/apt 537 1 37 2018-08-01 0.40 5 352
# Per-listing calendar table: listing, date, availability flag, price and min/max nights.
calendar=pd.read_csv('calendar_detail.csv')
calendar.head()
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
listing_id date available price adjusted_price minimum_nights maximum_nights
0 1165040 2019-04-17 f $511.00 $511.00 1.0 1125.0
1 1165040 2019-04-18 t $511.00 $511.00 1.0 1125.0
2 1165040 2019-04-19 t $511.00 $511.00 1.0 1125.0
3 1165040 2019-04-20 t $511.00 $511.00 1.0 1125.0
4 1165040 2019-04-21 t $511.00 $511.00 1.0 1125.0
# Administrative districts of Beijing.
neighbour=pd.read_csv('neighbourhoods.csv')
neighbour.head()
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
neighbourhood_group neighbourhood
0 NaN 東城區
1 NaN 豐臺區 / Fengtai
2 NaN 大興區 / Daxing
3 NaN 密雲縣 / Miyun
4 NaN 平谷區 / Pinggu
# Review table: listing_id and review date, plus the review text and reviewer information.
reviews=pd.read_csv('reviews_detail.csv')
reviews.head()
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
listing_id id date reviewer_id reviewer_name comments
0 44054 84748 2010-08-25 207019 Jarrod Sev was very helpful. Sev showed us where to ...
1 44054 118384 2010-10-13 218723 Kimberly We arrived in Beijing very early in the mornin...
2 44054 436978 2011-08-11 609177 Emma It is a really massive apartment and really co...
3 44054 1118657 2012-04-12 1787536 Andreyna Sev was incredibly helpful, showed us around t...
4 44054 2140650 2012-08-30 1179565 Frances The appartment was ideal for our party of 6 ad...

對listings表數據清洗

listings.head()

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

id name host_id host_name neighbourhood_group neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
0 44054 Modern and Comfortable Living in CBD 192875 East Apartments NaN 朝陽區 / Chaoyang 39.89503 116.45163 Entire home/apt 792 1 89 2019-03-04 0.85 9 341
1 100213 The Great Wall Box Deluxe Suite A團園長城小院東院套房 527062 Joe NaN 密雲縣 / Miyun 40.68434 117.17231 Private room 1201 1 2 2017-10-08 0.10 4 0
2 128496 Heart of Beijing: House with View 2 467520 Cindy NaN 東城區 39.93213 116.42200 Entire home/apt 389 3 259 2019-02-05 2.70 1 93
3 161902 cozy studio in center of Beijing 707535 Robert NaN 東城區 39.93357 116.43577 Entire home/apt 376 1 26 2016-12-03 0.28 5 290
4 162144 nice studio near subway, sleep 4 707535 Robert NaN 朝陽區 / Chaoyang 39.93668 116.43798 Entire home/apt 537 1 37 2018-08-01 0.40 5 352
#對listings表分析
listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28452 entries, 0 to 28451
Data columns (total 16 columns):
id                                28452 non-null int64
name                              28451 non-null object
host_id                           28452 non-null int64
host_name                         28452 non-null object
neighbourhood_group               0 non-null float64
neighbourhood                     28452 non-null object
latitude                          28452 non-null float64
longitude                         28452 non-null float64
room_type                         28452 non-null object
price                             28452 non-null int64
minimum_nights                    28452 non-null int64
number_of_reviews                 28452 non-null int64
last_review                       17294 non-null object
reviews_per_month                 17294 non-null float64
calculated_host_listings_count    28452 non-null int64
availability_365                  28452 non-null int64
dtypes: float64(4), int64(7), object(5)
memory usage: 3.5+ MB

# 觀察發現listings表有幾個問題:
# 1.neighbourhood_group列存在很多空值,查看統計信息
# 2.neighbourhood列有中文有英文,決定刪掉‘/yingwen’,僅保留neighbourhood列中文部分
# 3.查看經緯度是否有異常值
# 4.查看房屋類型有多少種
# 5.查看價格是否存在異常值
# 6.查看最小入住天數是否有異常值
# 7.查看評論數前10的id
# 8.查看每月評論數前十的id
# 9.查看365天中天數是否有異常值
# 10.name,last_review和reviews_per_month中都存在空值,不過影響不大

#發現neighbourhood_group列有很多空值,查看neighbourhood_group列的統計信息
listings['neighbourhood_group'].describe()

count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: neighbourhood_group, dtype: float64

由以上neighbourhood_group列的統計信息可知,neighbourhood_group列全爲空值,無意義,
所以決定在listings表中刪除neighbourhood_group列

# Drop the neighbourhood_group column; reassigning keeps `listings` as the cleaned table.
listings=listings.drop(['neighbourhood_group'],axis=1)
listings.head()

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

id name host_id host_name neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
0 44054 Modern and Comfortable Living in CBD 192875 East Apartments 朝陽區 / Chaoyang 39.89503 116.45163 Entire home/apt 792 1 89 2019-03-04 0.85 9 341
1 100213 The Great Wall Box Deluxe Suite A團園長城小院東院套房 527062 Joe 密雲縣 / Miyun 40.68434 117.17231 Private room 1201 1 2 2017-10-08 0.10 4 0
2 128496 Heart of Beijing: House with View 2 467520 Cindy 東城區 39.93213 116.42200 Entire home/apt 389 3 259 2019-02-05 2.70 1 93
3 161902 cozy studio in center of Beijing 707535 Robert 東城區 39.93357 116.43577 Entire home/apt 376 1 26 2016-12-03 0.28 5 290
4 162144 nice studio near subway, sleep 4 707535 Robert 朝陽區 / Chaoyang 39.93668 116.43798 Entire home/apt 537 1 37 2018-08-01 0.40 5 352
#查看neighbourhood列有哪幾種不同元素
listings['neighbourhood'].unique()

array(['朝陽區 / Chaoyang', '密雲縣 / Miyun', '東城區', '西城區', '海淀區',
       '順義區 / Shunyi', '房山區', '懷柔區 / Huairou', '昌平區', '通州區 / Tongzhou',
       '豐臺區 / Fengtai', '大興區 / Daxing', '延慶縣 / Yanqing', '石景山區',
       '門頭溝區 / Mentougou', '平谷區 / Pinggu'], dtype=object)

# Keep only the Chinese part of each neighbourhood name: take the text before
# the first '/' and trim the surrounding whitespace.
# Vectorised string methods replace the original per-row loop, which used
# chained indexing (listings['neighbourhood'][i]) and therefore only worked
# with the default 0..n-1 index, and was slow on the full table.
listings['neighbourhood']=listings['neighbourhood'].str.split('/').str[0].str.strip()

listings['neighbourhood'].unique()

array(['朝陽區', '密雲縣', '東城區', '西城區', '海淀區', '順義區', '房山區', '懷柔區', '昌平區',
       '通州區', '豐臺區', '大興區', '延慶縣', '石景山區', '門頭溝區', '平谷區'], dtype=object)

# 查看經緯度是否有異常值
listings['longitude'].describe()

count    28452.000000
mean       116.442000
std          0.204796
min        115.473390
25%        116.355283
50%        116.434665
75%        116.491122
max        117.495270
Name: longitude, dtype: float64

# Side-by-side box plots of longitude (left) and latitude (right).
fig=plt.figure()
for position,column in ((121,'longitude'),(122,'latitude')):
    # add_subplot makes the new axes current, so boxplot draws into it.
    fig.add_subplot(position)
    listings.boxplot(column=column)
plt.show()

[外鏈圖片轉存失敗(img-lm4nPlmK-1569078306325)(output_18_0.png)]

#從經緯度的箱型圖看,考慮到北京城區面積大,異常點的誤差都在1°內,
# 所以把所有經緯度數據認爲是正常值範圍。

# 查看房屋類型有多少種
#房屋類型有三種,分別是Entire home/apt,Private room,Shared room
listings['room_type'].unique()

array(['Entire home/apt', 'Private room', 'Shared room'], dtype=object)

# 查看價格是否存在異常值
listings['price'].describe()

count    28452.000000
mean       611.203325
std       1623.535077
min          0.000000
25%        235.000000
50%        389.000000
75%        577.000000
max      68983.000000
Name: price, dtype: float64



# One price box plot per room type, laid out in a single row of three panels.
fig=plt.figure()
for offset,kind in enumerate(['Entire home/apt','Private room','Shared room']):
    fig.add_subplot(131+offset)
    listings[listings.room_type==kind].boxplot(column='price')
    plt.title(kind)
plt.show()

[外鏈圖片轉存失敗(img-2PSdDcwj-1569078306326)(output_24_0.png)]

#查看價格爲0的房源基本信息,
listings[listings['price']==0]

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

id name host_id host_name neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
5085 20670843 【衚衕老宅~輕語竹林房】旅遊絕佳地段】步行即到雍和宮、近故宮天安門、南鑼鼓巷、美食簋街 129840905 Jing 東城區 39.94292 116.41323 Entire home/apt 0 2 81 2019-03-31 4.09 8 27
5806 21246510 限時 北京二環四合院別墅拍攝聚會 商務會議 娛樂同仁堂老宅 近簋街、雍和宮、東直門、南鑼鼓巷... 83233661 Eva 東城區 39.93677 116.42076 Entire home/apt 0 1 0 NaN NaN 6 167
28234 33895187 測試房源mm2 185140389 Ning Host 朝陽區 39.98147 116.47109 Private room 0 1 0 NaN NaN 2 359
# Judging by their names these listings cannot really be free,
# so treat a price of 0 as missing and replace it with NaN.
listings.loc[listings['price']==0,'price']=np.nan


listings['price'].describe()

count    28449.000000
mean       611.267777
std       1623.608547
min         27.000000
25%        235.000000
50%        389.000000
75%        577.000000
max      68983.000000
Name: price, dtype: float64

#查看了一下50000以上的房源的基本信息
#我認爲因爲北京有很多四合院,所以50000以上應該也是存在的吧
#這裏不認爲50000以上是異常值了
listings[listings['price']>50000]

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

id name host_id host_name neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
1067 12689987 Artistic apartment with culture 68973377 晨斌 朝陽區 39.92300 116.57996 Entire home/apt 67104.0 1 2 2016-06-03 0.06 2 365
2012 15488817 Hotel apartment close to huge Mall 68973377 晨斌 朝陽區 39.91962 116.59173 Entire home/apt 63346.0 1 16 2017-04-17 0.55 2 180
5167 20748712 大望路/九龍山大牀房 141070198 朝陽區 39.88798 116.47667 Entire home/apt 59997.0 1 1 2017-09-05 0.05 3 91
6612 21942314 【溫馨小窩窩】近地鐵一號線五棵松/萬壽路,距離北京西站3站地,15分鐘。 48178909 Qing 海淀區 39.89523 116.28252 Shared room 59997.0 1 4 2018-03-23 0.24 1 180
10170 24994830 良鄉大學城兩室溫馨小屋 188806180 房山區 39.72157 116.15182 Entire home/apt 68828.0 1 1 2018-09-28 0.15 1 181
13668 27587044 房源已下架 208158466 昌平區 40.08912 116.29895 Private room 66667.0 30 0 NaN NaN 1 91
14697 28134193 此房不能租,不要詢問了 212328505 海淀區 39.94947 116.36246 Entire home/apt 68983.0 1 1 2018-09-10 0.14 1 90
16207 28803519 【北京站地鐵3分鐘.故宮周邊最優惠.王府井商圈】溪流到家靜雅民宿 216392612 東城區 39.90583 116.42199 Entire home/apt 65970.0 1 1 2018-10-29 0.18 1 0
17083 29138170 全A小築 74938348 朝陽區 39.89685 116.45925 Entire home/apt 59997.0 1 0 NaN NaN 1 0
21809 31535043 水立方,鳥巢附近六人間男神牀位 236331220 王林 昌平區 40.07817 116.42163 Shared room 67909.0 1 0 NaN NaN 2 180
listings['minimum_nights'].describe()

count    28452.000000
mean         2.729685
std         17.920932
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max       1125.000000
Name: minimum_nights, dtype: float64

#查看最小入住天數的箱型圖
listings.boxplot(column='minimum_nights')
plt.show()

[外鏈圖片轉存失敗(img-NiB0o4PZ-1569078306327)(output_30_0.png)]

# 查閱入住最小天數爲400天以上的情況
#經查,結合地理位置和房型等信息,入住最小天數爲400天以上是合理的
listings[listings['minimum_nights']>400]

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

id name host_id host_name neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
1175 13183350 鳥巢旁歐式羅曼蒂克兩居 56183837 Jack 朝陽區 39.99724 116.40110 Entire home/apt 463.0 1000 2 2016-07-25 0.06 1 0
5609 21098578 6號14號線金臺路青旅燕兒窩上下鋪國貿CBD 129486966 朝陽區 39.91924 116.48459 Shared room 121.0 1124 4 2018-08-03 0.33 6 362
6505 21841908 鳥巢水立方朝南大主臥 159278266 小夢 海淀區 40.03241 116.36671 Private room 255.0 1000 0 NaN NaN 1 0
19983 30752723 中關村新東方北大清華五道口頤和園圓明園魏公村 220133868 未知 海淀區 39.98541 116.31650 Entire home/apt 9998.0 1125 1 2019-01-29 0.38 2 0
25664 33297102 姚家園西里小區 250785867 子豪 朝陽區 39.94690 116.51349 Shared room 2798.0 500 0 NaN NaN 1 365
#查看availability_365列
#由統計信息可知,availability_365的最大值是365,最小值是0是合理的
#所以判定availability_365列無異常值
listings['availability_365'].describe()

count    28452.000000
mean       220.342120
std        138.430677
min          0.000000
25%         87.000000
50%        209.000000
75%        361.000000
max        365.000000
Name: availability_365, dtype: float64

對listings表進行數據分析

# Scatter every listing by longitude/latitude to show the geographic distribution.
plt.scatter(x=listings['longitude'],y=listings['latitude'],alpha=0.1)
longitude_mean=listings['longitude'].mean()
latitude_mean=listings['latitude'].mean()
# Mark the mean coordinate in red.
plt.scatter(x=longitude_mean,y=latitude_mean,c='r')
# Label the mean point with a real string. The original passed a tuple of two
# strings as the text argument, so the label relied on implicit conversion and
# showed the tuple's repr (quotes included).
plt.text(longitude_mean, latitude_mean-0.1, '(%1.3f, %1.3f)'%(longitude_mean,latitude_mean),ha='center', va='bottom', fontsize=12)
plt.title('北京房源的地理位置分佈情況')
plt.xlabel('經度')
plt.ylabel('緯度')

plt.savefig('北京房源的地理位置分佈散點圖.png',dpi=500,bbox_inches = 'tight')
plt.show()



[外鏈圖片轉存失敗(img-wnd30ozn-1569078306327)(output_34_0.png)]

由散點圖可看出北京房源的地理位置分佈情況
其中經緯度的均值點爲(116.442,39.983),該點爲北京市朝陽區西壩河路附近
北京朝陽區的房源密度最高

# Bar chart of the number of listings per district.
# value_counts() is computed once and reused instead of four separate times.
district_counts=listings['neighbourhood'].value_counts()
plt.bar(district_counts.index,district_counts)
plt.title('北京各城區房源數量')
# rotation is given as a number: the documented values are a float,
# 'vertical' or 'horizontal', not a numeric string such as '45'.
plt.xticks(district_counts.index, district_counts.index, rotation=45)

plt.savefig('北京各城區房源數量.png',dpi=500,bbox_inches = 'tight')
plt.show()



[外鏈圖片轉存失敗(img-TNNfZQMX-1569078306328)(output_36_0.png)]

listings['neighbourhood'].value_counts()

朝陽區     10810
東城區      3346
海淀區      3197
豐臺區      1758
西城區      1701
通州區      1290
昌平區      1034
密雲縣       935
順義區       920
懷柔區       833
大興區       823
延慶縣       718
房山區       579
石景山區      213
門頭溝區      152
平谷區       143
Name: neighbourhood, dtype: int64

由北京各城區房源數量條形圖可看出,北京朝陽區的房源數量最多,超過10000套,遠遠高於其他行政區,北京平谷區房源最少.
房源數量排名前三位的分別是朝陽區,東城區和海淀區.

listings['neighbourhood'].value_counts()

朝陽區     10810
東城區      3346
海淀區      3197
豐臺區      1758
西城區      1701
通州區      1290
昌平區      1034
密雲縣       935
順義區       920
懷柔區       833
大興區       823
延慶縣       718
房山區       579
石景山區      213
門頭溝區      152
平谷區       143
Name: neighbourhood, dtype: int64

# Number of listings (bars, left axis) and mean price (red line, right axis) per room type.
fig,ax1=plt.subplots()
data1=listings['room_type'].value_counts()
t=data1.index
# Compute the mean price in the same order as the bars. The original listed
# Entire/Private/Shared by hand, which matches the value_counts() order only
# when the counts happen to be sorted that way.
data2=[listings.loc[listings.room_type==room,'price'].mean() for room in t]
ax1.bar(t,data1,width=0.3)
ax1.set_ylabel('房源數量')

# Write the count above each bar (drawn explicitly on the bar axes).
for x,y in enumerate(data1):
    ax1.text(x,y+200,str(y),ha='center')

ax2 = ax1.twinx()
# Mean price per room type on the secondary y axis.
ax2.plot(t,data2,c='r')
ax2.set_ylabel('平均價格')
plt.show()

[外鏈圖片轉存失敗(img-yWhxoEAv-1569078306328)(output_40_0.png)]

# Number of listings per district, split by room type (grouped bars).
# Districts are ordered by total listing count, most listings first.
labels=listings['neighbourhood'].value_counts().index.tolist()
# One row per district, one column per room type. Reindexing puts every series
# in the same district order as `labels` and fills missing combinations with 0.
# The original took value_counts().values separately for each room type, so each
# series was sorted by its own counts and the bars did not necessarily belong to
# the district named on the tick; a district lacking a room type would also
# have produced arrays of different lengths.
counts=pd.crosstab(listings['neighbourhood'],listings['room_type']).reindex(
    index=labels,columns=['Entire home/apt','Private room','Shared room'],fill_value=0)
x=np.arange(len(labels))
# Entire home/apt
y1=counts['Entire home/apt'].values
plt.bar(x,y1,width=0.25,label='Entire home/apt')
# Private room
y2=counts['Private room'].values
plt.bar(x+0.25,y2,width=0.25,label='Private room')
# Shared room
y3=counts['Shared room'].values
plt.bar(x+0.5,y3,width=0.25,label='Shared room')
plt.title('北京不同地區不同房型的房源數量')
plt.xticks(x,labels,rotation=60)
plt.legend()

plt.savefig('北京各城區不同房型的房源數量.png',dpi=500,bbox_inches = 'tight')
plt.show()


[外鏈圖片轉存失敗(img-CB99ABrO-1569078306328)(output_41_0.png)]

# Line chart: mean price per district for each room type.
mm=listings.groupby(['neighbourhood','room_type'])['price'].mean()
# Pivot to a districts x room-types table. The original read mm[3*i], mm[3*i+1],
# mm[3*i+2] positionally, which is only right when every district has all three
# room types, and paired the values with a hand-typed label list. Taking labels
# from the table index keeps ticks and values in step; a missing combination
# becomes NaN (a gap in the line) instead of shifting every later value.
mean_price=mm.unstack('room_type').reindex(columns=['Entire home/apt','Private room','Shared room'])
labels=mean_price.index.tolist()
xx=np.arange(len(labels))
y11=mean_price['Entire home/apt'].tolist()
y22=mean_price['Private room'].tolist()
y33=mean_price['Shared room'].tolist()
plt.plot(xx,y11,label='Entire home/apt')
plt.plot(xx,y22,label='Private room')
plt.plot(xx,y33,label='Shared room')
plt.legend()
plt.xticks(xx,labels,rotation=60)
plt.show()

[外鏈圖片轉存失敗(img-nnLBtSZJ-1569078306329)(output_42_0.png)]

# 考慮到這個圖中有部分城區的Shared room比Entire home/apt的平均價格還高,顯然是不合理的
# 所以我從實際出發,取每種房型25%-75%之間的租金,然後取平均數

# Grouped bar chart: mean price per district for each room type.
mm=listings.groupby(['neighbourhood','room_type'])['price'].mean()
# Pivot to a districts x room-types table instead of reading mm[3*i + k]
# positionally: the positional form assumes every district has all three room
# types and depended on a hand-typed label list matching the group order.
mean_price=mm.unstack('room_type').reindex(columns=['Entire home/apt','Private room','Shared room'])
labels=mean_price.index.tolist()
xx=np.arange(len(labels))
y11=mean_price['Entire home/apt'].tolist()
y22=mean_price['Private room'].tolist()
y33=mean_price['Shared room'].tolist()
plt.bar(xx,y11,width=0.1,label='Entire home/apt')
plt.bar(xx+0.1,y22,width=0.1,label='Private room')
plt.bar(xx+0.2,y33,width=0.1,label='Shared room')
plt.legend()
plt.xticks(xx,labels,rotation=60)
plt.show()


[外鏈圖片轉存失敗(img-nWVXDVbX-1569078306329)(output_44_0.png)]

考慮到這個圖中有部分城區的Shared room比Entire home/apt的平均價格還高,顯然是不合理的,
所以我從實際出發,取箱型圖的中位數作爲各城區不同房型的價格參考標準

# Price box plot for each room type, three panels in one row; the figure is saved to disk.
fig=plt.figure()
for offset,kind in enumerate(['Entire home/apt','Private room','Shared room']):
    fig.add_subplot(131+offset)
    listings[listings.room_type==kind].boxplot(column='price')
    plt.title(kind)

plt.savefig('不同房型價格分佈箱型圖.png',dpi=500,bbox_inches = 'tight')
plt.show()



[外鏈圖片轉存失敗(img-yV66TVHx-1569078306329)(output_46_0.png)]

#不同城區Entire home/apt的中位數
listings[listings.room_type=='Entire home/apt'].groupby('neighbourhood')['price'].describe()['50%']

neighbourhood
東城區      530.0
豐臺區      396.0
大興區      379.0
密雲縣      799.0
平谷區      819.0
延慶縣     1000.0
懷柔區     1678.0
房山區      282.0
昌平區      537.0
朝陽區      470.0
海淀區      490.0
石景山區     429.0
西城區      497.0
通州區      336.0
門頭溝區     289.0
順義區      396.0
Name: 50%, dtype: float64

#不同城區Private room的中位數
listings[listings.room_type=='Private room'].groupby('neighbourhood')['price'].describe()['50%']

neighbourhood
東城區     336.0
豐臺區     188.0
大興區     177.5
密雲縣     356.0
平谷區     382.0
延慶縣     497.0
懷柔區     537.0
房山區     201.0
昌平區     201.0
朝陽區     215.0
海淀區     242.0
石景山區    251.5
西城區     302.0
通州區     188.0
門頭溝區    899.0
順義區     255.0
Name: 50%, dtype: float64

#不同城區Shared room的中位數
listings[listings.room_type=='Shared room'].groupby('neighbourhood')['price'].describe()['50%']


neighbourhood
東城區      107.0
豐臺區      127.0
大興區      148.0
密雲縣      148.0
平谷區      101.0
延慶縣     1188.0
懷柔區      886.0
房山區      174.0
昌平區      107.0
朝陽區      101.0
海淀區      101.0
石景山區     140.5
西城區      107.0
通州區       94.0
門頭溝區      94.0
順義區      157.5
Name: 50%, dtype: float64

# Line chart of the median price per district for each room type
# (the author considered this chart too ugly and discarded it).
# A single grouped median, pivoted to districts x room types, replaces three
# independent groupby results that were assumed to share the same district order
# and to match a hand-typed label list; labels now come from the table itself.
median_price=(listings.groupby(['neighbourhood','room_type'])['price'].median()
              .unstack('room_type')
              .reindex(columns=['Entire home/apt','Private room','Shared room']))
labels=median_price.index.tolist()
xx=np.arange(len(labels))
# Median price of Entire home/apt per district
y11=median_price['Entire home/apt'].values
# Median price of Private room per district
y22=median_price['Private room'].values
# Median price of Shared room per district
y33=median_price['Shared room'].values
plt.plot(xx,y11,label='Entire home/apt')
plt.plot(xx,y22,label='Private room')
plt.plot(xx,y33,label='Shared room')
plt.legend()
plt.xticks(xx,labels,rotation=60)
plt.show()

[外鏈圖片轉存失敗(img-xiRIpoJM-1569078306330)(output_50_0.png)]

# Grouped bar chart of the median price per district for each room type.
# A single grouped median, pivoted to districts x room types, keeps the three
# series and the tick labels aligned; the original relied on three separate
# groupby results and a hand-typed label list all having the same order and length.
median_price=(listings.groupby(['neighbourhood','room_type'])['price'].median()
              .unstack('room_type')
              .reindex(columns=['Entire home/apt','Private room','Shared room']))
labels=median_price.index.tolist()
xx=np.arange(len(labels))
# Median price of Entire home/apt per district
y11=median_price['Entire home/apt'].values
# Median price of Private room per district
y22=median_price['Private room'].values
# Median price of Shared room per district
y33=median_price['Shared room'].values
plt.bar(xx,y11,width=0.2,label='Entire home/apt')
plt.bar(xx+0.2,y22,width=0.2,label='Private room')
plt.bar(xx+0.4,y33,width=0.2,label='Shared room')
plt.legend()
plt.xticks(xx,labels,rotation=60)
plt.grid(axis='y',alpha=0.2)
plt.title('北京地區不同城區不同房型中位數價格分佈')

plt.savefig('北京地區各城區不同房型中位數價格分佈.png',dpi=500,bbox_inches = 'tight')
plt.show()



[外鏈圖片轉存失敗(img-tqB1XZ1n-1569078306330)(output_51_0.png)]

懷柔區,延慶縣,平谷區,密雲縣的Entire home/apt的中位數價格遠高於北京主城區,可能是因爲這些行政區內有多處度假村。
延慶縣,懷柔區的Shared room的價格高於Private room,可能是因爲他們Shared room的樣本數據太少,均只有三組數據。
門頭溝區的Private room價格遠高於Entire home/apt,可能是因爲門頭溝區的Private room的樣本數據較少,只有39組,且提供的數據中高租金價格佔比較多。

len(listings[(listings.neighbourhood=='密雲縣') & (listings.room_type=='Entire home/apt')]['price'])

496

# Price reference table: the '50%' (median) column of describe() for every
# district / room type pair, exported to an Excel file.
jiagecanzhaobiao=listings.groupby(['neighbourhood','room_type'])['price'].describe()['50%']
jiagecanzhaobiao.to_excel('價格參照表.xlsx')

len(listings[(listings.neighbourhood=='密雲縣') & (listings.room_type=='Shared room')]['price'])

5

收集大客戶房東信息

# Key-account hosts: the 20 hosts with the most listings, exported to Excel.
host_listing_counts=listings.groupby(['host_id','host_name'])[['id']].count()
dakehufangdong=host_listing_counts.sort_values(by='id',ascending=False).head(20)
dakehufangdong.to_excel('大客戶房東信息表.xlsx')

# 查看第一名大客戶"美婷"的房源分佈情況
listings[listings.host_id==209669028]['neighbourhood'].value_counts()

朝陽區    178
東城區     44
Name: neighbourhood, dtype: int64

# 查看第二名大客戶"興偉"的房源分佈情況
listings[listings.host_id==54436429]['neighbourhood'].value_counts()

海淀區    137
朝陽區     57
東城區     15
豐臺區      1
Name: neighbourhood, dtype: int64

# 查看第三名大客戶"海梅"的房源分佈情況
listings[listings.host_id==156249912]['neighbourhood'].value_counts()

朝陽區    113
海淀區      2
Name: neighbourhood, dtype: int64

# 查看第四名大客戶"Cathy"的房源分佈情況
# listings[listings.host_id==17619297]
listings[listings.host_id==17619297]['neighbourhood'].value_counts()

海淀區    47
朝陽區    45
西城區     4
昌平區     1
東城區     1
Name: neighbourhood, dtype: int64

# Listing distribution by district for the fifth key-account host, "金桔精品民宿".
# listings[listings.host_id==156143513]
listings[listings.host_id==156143513]['neighbourhood'].value_counts()

通州區    54
順義區    19
朝陽區     4
Name: neighbourhood, dtype: int64

# Grouped bar chart of where the top five hosts' listings are located.
# The counts below are typed in by hand, one list per district with one entry
# per host in the order given by `labels`.
x=np.arange(5)
labels=['美婷','興偉','海梅','Cathy','金桔精品民宿']
# 海淀區
y1=[0,137,2,47,0]
plt.bar(x,y1,width=0.1,label='海淀區')
# 朝陽區
y2=[178,57,113,45,4]
plt.bar(x+0.1,y2,width=0.1,label='朝陽區')
# 東城區
y3=[44,15,0,1,0]
plt.bar(x+0.2,y3,width=0.1,label='東城區')
# 豐臺區
y4=[0,1,0,0,0]
plt.bar(x+0.3,y4,width=0.1,label='豐臺區')
# 昌平區
y5=[0,0,0,1,0]
plt.bar(x+0.4,y5,width=0.1,label='昌平區')
# 西城區
y6=[0,0,0,4,0]
plt.bar(x+0.5,y6,width=0.1,label='西城區')
# 通州區
y7=[0,0,0,0,54]
plt.bar(x+0.6,y7,width=0.1,label='通州區')
# 順義區
y8=[0,0,0,0,19]
plt.bar(x+0.7,y8,width=0.1,label='順義區')

plt.xticks(np.arange(5),labels)
plt.legend()
plt.title('前五名大房東的房源分佈圖')

plt.savefig('前五名大房東的房源分佈圖.png',dpi=500,bbox_inches = 'tight')
plt.show()

[外鏈圖片轉存失敗(img-UlqCKdXy-1569078306330)(output_63_0.png)]

由上圖可以看出,房源數量排名前五的大房東中朝陽區和海淀區的房源數量最多,而且這些大房東的房源分佈通常在兩個到三個行政區.

對calendar表數據清洗

calendar.head()

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

listing_id date available price adjusted_price minimum_nights maximum_nights
0 1165040 2019-04-17 f $511.00 $511.00 1.0 1125.0
1 1165040 2019-04-18 t $511.00 $511.00 1.0 1125.0
2 1165040 2019-04-19 t $511.00 $511.00 1.0 1125.0
3 1165040 2019-04-20 t $511.00 $511.00 1.0 1125.0
4 1165040 2019-04-21 t $511.00 $511.00 1.0 1125.0
# Clean the calendar prices: strip the '$' sign and the thousands separators
# from price and adjusted_price, then convert both columns to float.
# Vectorised string methods replace the original row-by-row loop, which wrote
# through Series.values (so it depended on that array being a writable view of
# the column), raised on missing prices, and is slow on large tables.
# astype(str) makes the cell safe to re-run after the columns are already
# numeric; missing prices end up as NaN.
for price_column in ['price','adjusted_price']:
    calendar[price_column]=(calendar[price_column].astype(str)
                            .str.strip('$')
                            .str.replace(',','',regex=False)
                            .astype(float))

#查看修改後的calendar的
calendar.head()

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

listing_id date available price adjusted_price minimum_nights maximum_nights
0 1165040 2019-04-17 f 511 511 1.0 1125.0
1 1165040 2019-04-18 t 511 511 1.0 1125.0
2 1165040 2019-04-19 t 511 511 1.0 1125.0
3 1165040 2019-04-20 t 511 511 1.0 1125.0
4 1165040 2019-04-21 t 511 511 1.0 1125.0

查找calendar表指定數據

# Scenario: an ordinary tourist travelling with their mother wants a private room
# in Chaoyang for roughly 300-1000 per night, three nights, checking in on
# 2019-10-01 and checking out on 2019-10-04.
# `want` holds the basic information of the listings that meet these requirements.
is_private_room=listings.room_type=='Private room'
in_budget=(listings.price>300)&(listings.price<1000)
has_availability=listings.availability_365>0
short_minimum_stay=listings.minimum_nights<4
in_chaoyang=listings.neighbourhood=='朝陽區'
want=listings[is_private_room&in_budget&has_availability&short_minimum_stay&in_chaoyang]
len(want)

688

# Calendar rows of the candidate listings that are available on each day from
# 2019-10-01 to 2019-10-04; every step only keeps listings that passed the previous day.
def _available_on(day,candidate_ids):
    """Return the calendar rows for `day` that are available and belong to `candidate_ids`."""
    on_day=calendar.date==day
    is_free=calendar.available=='t'
    is_candidate=calendar['listing_id'].isin(candidate_ids)
    return calendar[on_day&is_free&is_candidate]

new1=_available_on('2019-10-01',want['id'].values)
new2=_available_on('2019-10-02',new1['listing_id'].values)
new3=_available_on('2019-10-03',new2['listing_id'].values)
new4=_available_on('2019-10-04',new3['listing_id'].values)


#查看calendar建成的表new4的統計信息
new4[['minimum_nights','maximum_nights']].describe()

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

minimum_nights maximum_nights
count 513.000000 513.000000
mean 1.222222 939.087719
std 1.168154 391.935275
min 1.000000 1.000000
25% 1.000000 1125.000000
50% 1.000000 1125.000000
75% 1.000000 1125.000000
max 24.000000 1125.000000
# Keep the rows whose stay-length limits allow a 3-night booking:
# minimum_nights must be at most 3 and maximum_nights at least 3.
# The original tested minimum_nights<=4, which also kept listings that require
# a 4-night stay and disagreed with the minimum_nights<4 filter used for `want`.
stay_nights=3
new5=new4[(new4.minimum_nights<=stay_nights)&(new4.maximum_nights>=stay_nights)]
# new5 is the calendar information of the listings that satisfy every requirement.

對reviews表數據清洗

reviews.head()

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

listing_id id date reviewer_id reviewer_name comments
0 44054 84748 2010-08-25 207019 Jarrod Sev was very helpful. Sev showed us where to ...
1 44054 118384 2010-10-13 218723 Kimberly We arrived in Beijing very early in the mornin...
2 44054 436978 2011-08-11 609177 Emma It is a really massive apartment and really co...
3 44054 1118657 2012-04-12 1787536 Andreyna Sev was incredibly helpful, showed us around t...
4 44054 2140650 2012-08-30 1179565 Frances The appartment was ideal for our party of 6 ad...
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 202099 entries, 0 to 202098
Data columns (total 6 columns):
listing_id       202099 non-null int64
id               202099 non-null int64
date             202099 non-null object
reviewer_id      202099 non-null int64
reviewer_name    202093 non-null object
comments         201983 non-null object
dtypes: int64(3), object(3)
memory usage: 9.3+ MB

# 雖然reviewer_name有部分缺失,但是reviewer_id沒有缺失,所以沒有關係
# comments缺失也是可以接受的

# The 20 reviewer_ids with the most reviews (value_counts sorts by count, descending).
review_counts_by_reviewer=reviews['reviewer_id'].value_counts()
top20_reviewers=review_counts_by_reviewer.head(20)
top20_reviewers

186684246    43
21067785     35
158695647    34
99325050     32
149769588    26
140955472    26
213893643    24
6532783      23
229832388    23
196283240    23
28903457     23
117241519    21
104082034    21
3671922      21
165536239    20
16660997     20
10684339     20
228835331    20
43905550     19
50995265     19
Name: reviewer_id, dtype: int64

收集大客戶住戶信息

# Build `topreviewer`: reviewer_id, reviewer_name and total number of reviews
# for the 20 most active reviewers.
# One name per reviewer_id (the first non-null name seen) is looked up in a
# single pass. The original filtered the whole reviews table once per reviewer
# and assigned the ndarray returned by unique() into a single cell, which only
# works when that array has exactly one element.
reviewer_names=(reviews.dropna(subset=['reviewer_name'])
                .drop_duplicates('reviewer_id')
                .set_index('reviewer_id')['reviewer_name'])
topreviewer=pd.DataFrame({'top_reviewer_id':top20_reviewers.index,
                          'top_reviewer_name':reviewer_names.reindex(top20_reviewers.index).values,
                          'sum_reviews':top20_reviewers.values})

topreviewer

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

top_reviewer_id top_reviewer_name sum_reviews
0 186684246 Tomm 43
1 21067785 Jasmine 35
2 158695647 34
3 99325050 新月 32
4 149769588 金龍 26
5 140955472 Marines 26
6 213893643 賽亞 24
7 6532783 Dee 23
8 229832388 星河 23
9 196283240 羊陽 23
10 28903457 Yan 23
11 117241519 蘭蘭 21
12 104082034 Jonmiae 21
13 3671922 Kum Hong 21
14 165536239 Holm 20
15 16660997 Tao 20
16 10684339 Mia 20
17 228835331 Y 20
18 43905550 Salome 19
19 50995265 Bitong 19
# 把topreviewer保存成表格topreviewer
topreviewer.to_excel('topreviewer.xlsx',index = False)

最受歡迎民宿特點

製作評論詞雲

# Remove the '\r\n' line breaks from every review comment and write the text to comments1.txt.
# The with-block closes the file even if writing fails, and missing comments are
# skipped so the literal text 'nan' is not written into the word-cloud input.
with open('comments1.txt','w',encoding='utf-8') as file:
    for comment in reviews['comments'].dropna():
        file.write(str(comment).replace('\r\n',''))

#導入詞雲相關的庫
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator
import numpy as np
import  jieba
import wordcloud

# Draw the word cloud of the review comments, using the Beijing map image as
# both the mask and the colour source.
# The with-block closes the input file; the original never closed this handle.
with open('comments1.txt','r',encoding='utf-8') as file:
    data = file.read()  # the whole comment corpus as one string
path_img='beijingmap.jpg'
background_image = np.array(Image.open(path_img))
w = wordcloud.WordCloud(font_path='./fonts/simhei.ttf',max_words=100,
                         background_color="white",
                       mask=background_image).generate(data)
image_colors = ImageColorGenerator(background_image)
# Show the cloud recoloured with the colours taken from the background image.
plt.imshow(w.recolor(color_func=image_colors))
plt.axis("off")


plt.show()

w.to_file('comments.png')

[外鏈圖片轉存失敗(img-vIn6IWtk-1569078306331)(output_87_0.png)]

<wordcloud.wordcloud.WordCloud at 0x22c544f7e48>

從評論的詞雲圖裏可看出,旅客最看重交通便利,房東熱情,房屋乾淨這三點。
此外部分旅客還會關注設施齊全,牀舒服,離地鐵站近,有家的感覺等。

reviews.head()

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

listing_id id date reviewer_id reviewer_name comments
0 44054 84748 2010-08-25 207019 Jarrod Sev was very helpful. Sev showed us where to ...
1 44054 118384 2010-10-13 218723 Kimberly We arrived in Beijing very early in the mornin...
2 44054 436978 2011-08-11 609177 Emma It is a really massive apartment and really co...
3 44054 1118657 2012-04-12 1787536 Andreyna Sev was incredibly helpful, showed us around t...
4 44054 2140650 2012-08-30 1179565 Frances The appartment was ideal for our party of 6 ad...
len(reviews)

202099

#評論表中不同房源出現次數
#np.array實現把index轉換爲數組
np.array(reviews['listing_id'].value_counts().index)


array([ 6622351,  6596814, 11911698, ..., 33781069, 28482595, 33261981],
      dtype=int64)

# Ids of the most-reviewed listings (the author's "top 5%"), most reviews first.
# NOTE(review): 865 is hard-coded — presumably 5% of the number of listings that
# have at least one review; confirm and recompute if the data changes.
haofangzi_id=np.array(reviews['listing_id'].value_counts().index)[:865]


listings.head()

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

id name host_id host_name neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
0 44054 Modern and Comfortable Living in CBD 192875 East Apartments 朝陽區 39.89503 116.45163 Entire home/apt 792.0 1 89 2019-03-04 0.85 9 341
1 100213 The Great Wall Box Deluxe Suite A團園長城小院東院套房 527062 Joe 密雲縣 40.68434 117.17231 Private room 1201.0 1 2 2017-10-08 0.10 4 0
2 128496 Heart of Beijing: House with View 2 467520 Cindy 東城區 39.93213 116.42200 Entire home/apt 389.0 3 259 2019-02-05 2.70 1 93
3 161902 cozy studio in center of Beijing 707535 Robert 東城區 39.93357 116.43577 Entire home/apt 376.0 1 26 2016-12-03 0.28 5 290
4 162144 nice studio near subway, sleep 4 707535 Robert 朝陽區 39.93668 116.43798 Entire home/apt 537.0 1 37 2018-08-01 0.40 5 352
# Look up the district and room type of every top-reviewed listing.
# One indexed lookup replaces a full scan of `listings` per id (twice per
# iteration in the original). Ids that are missing from `listings` are skipped
# instead of raising IndexError on `.values[0]`; for duplicated ids the first
# row is used, as before.
top_listing_info=listings.drop_duplicates('id').set_index('id').reindex(haofangzi_id)
aneighbour=top_listing_info['neighbourhood'].dropna().tolist()
broomtype=top_listing_info['room_type'].dropna().tolist()

# np.unique(..., return_counts=True) returns (unique values, their counts);
# collections.Counter would be another way to count occurrences.
aaneighbour=np.unique(aneighbour,return_counts=True)
bbroomtype=np.unique( broomtype,return_counts=True)

# Bar chart: number of top-reviewed listings in each district.
plt.bar(aaneighbour[0],aaneighbour[1])

# rotation is given as a number: the documented values are a float,
# 'vertical' or 'horizontal', not a numeric string such as '60'.
plt.xticks(rotation=60)
plt.grid(axis='y',alpha=0.2)
plt.title('入住次數前5%的房源地區分佈')
plt.savefig('入住次數前5%的房源地區分佈.png',dpi=500,bbox_inches = 'tight')
plt.show()


[外鏈圖片轉存失敗(img-BSjHOqar-1569078306332)(output_97_0.png)]

# 從上圖可以看出,朝陽區和東城區的民宿入住需求最高。

# Bar chart: number of top-reviewed listings of each room type
# (bbroomtype[0] holds the room types, bbroomtype[1] their counts).
plt.bar(bbroomtype[0],bbroomtype[1],width=0.25)

# plt.xticks(rotation='60')
plt.grid(axis='y',alpha=0.2)
plt.title('入住次數前5%的房型分佈')
plt.savefig('入住次數前5%的房型分佈.png',dpi=500,bbox_inches = 'tight')
plt.show()

[外鏈圖片轉存失敗(img-74gelpwG-1569078306332)(output_99_0.png)]

# Pie chart of the room-type shares among the top-reviewed listings.
plt.figure(figsize=(5,5))
values=bbroomtype[1].tolist()
labels=bbroomtype[0].tolist()
# Offset every wedge slightly from the centre. The list is sized from the data:
# the original hard-coded three entries, which makes plt.pie fail whenever the
# top listings contain fewer than three room types.
explode=[0.01]*len(values)
plt.pie(values,explode=explode,labels=labels,autopct='%1.1f%%',startangle=261)
plt.title('入住次數前5%的房型分佈餅圖')  # chart title
plt.savefig('入住次數前5%的房型分佈餅圖',dpi=500,bbox_inches = 'tight')  # save the figure
plt.show()

[外鏈圖片轉存失敗(img-mEcm9b8L-1569078306332)(output_100_0.png)]

# Prices of the top-reviewed listings, in the same order as haofangzi_id.
# One indexed lookup replaces a full scan of `listings` per id. An id missing
# from `listings` yields NaN instead of raising IndexError; for duplicated ids
# the first row is used, as before.
pprice=listings.drop_duplicates('id').set_index('id')['price'].reindex(haofangzi_id).tolist()


pprice

# Box plot of the prices of the top-reviewed listings.
dandan=pd.DataFrame(pprice)
dandan.plot.box(title="入住次數前5%的價格分佈")
plt.grid(linestyle="--", alpha=0.3)
# NOTE(review): the output file name ends in "餅圖" (pie chart) although a box plot is saved here.
plt.savefig('入住次數前5%的價格分佈餅圖',dpi=500,bbox_inches = 'tight')
plt.show()

[外鏈圖片轉存失敗(img-urzMcN8y-1569078306333)(output_103_0.png)]

# Scatter plot of the top-reviewed listings' prices: price on the x axis,
# position in pprice on the y axis.
plt.scatter(x=pprice,y=np.arange(len(pprice)))
plt.show()

[外鏈圖片轉存失敗(img-YLRZC1YC-1569078306333)(output_104_0.png)]

dandan.describe()

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

0
count 864.000000
mean 383.391204
std 259.259195
min 67.000000
25% 201.000000
50% 329.000000
75% 483.000000
max 2221.000000
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章