短租數據集分析
文章目錄
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif']=['SimHei'] #用來正常顯示中文標籤
plt.rcParams['axes.unicode_minus']=False #用來正常顯示負號
%matplotlib inline
查看listings,calendar,reviews,neighbourhoods各表的基本內容
#房源基礎信息,包括房源、房東、位置、類型、價格、評論數量和可租時間等等。
listings=pd.read_csv('listings.csv')
listings.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
id | name | host_id | host_name | neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 44054 | Modern and Comfortable Living in CBD | 192875 | East Apartments | NaN | 朝陽區 / Chaoyang | 39.89503 | 116.45163 | Entire home/apt | 792 | 1 | 89 | 2019-03-04 | 0.85 | 9 | 341 |
1 | 100213 | The Great Wall Box Deluxe Suite A團園長城小院東院套房 | 527062 | Joe | NaN | 密雲縣 / Miyun | 40.68434 | 117.17231 | Private room | 1201 | 1 | 2 | 2017-10-08 | 0.10 | 4 | 0 |
2 | 128496 | Heart of Beijing: House with View 2 | 467520 | Cindy | NaN | 東城區 | 39.93213 | 116.42200 | Entire home/apt | 389 | 3 | 259 | 2019-02-05 | 2.70 | 1 | 93 |
3 | 161902 | cozy studio in center of Beijing | 707535 | Robert | NaN | 東城區 | 39.93357 | 116.43577 | Entire home/apt | 376 | 1 | 26 | 2016-12-03 | 0.28 | 5 | 290 |
4 | 162144 | nice studio near subway, sleep 4 | 707535 | Robert | NaN | 朝陽區 / Chaoyang | 39.93668 | 116.43798 | Entire home/apt | 537 | 1 | 37 | 2018-08-01 | 0.40 | 5 | 352 |
#房源時間表信息,包括房源、時間、是否可租、租金和可租天數等等。
calendar=pd.read_csv('calendar_detail.csv')
calendar.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
listing_id | date | available | price | adjusted_price | minimum_nights | maximum_nights | |
---|---|---|---|---|---|---|---|
0 | 1165040 | 2019-04-17 | f | $511.00 | $511.00 | 1.0 | 1125.0 |
1 | 1165040 | 2019-04-18 | t | $511.00 | $511.00 | 1.0 | 1125.0 |
2 | 1165040 | 2019-04-19 | t | $511.00 | $511.00 | 1.0 | 1125.0 |
3 | 1165040 | 2019-04-20 | t | $511.00 | $511.00 | 1.0 | 1125.0 |
4 | 1165040 | 2019-04-21 | t | $511.00 | $511.00 | 1.0 | 1125.0 |
#北京的行政區劃
neighbour=pd.read_csv('neighbourhoods.csv')
neighbour.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
neighbourhood_group | neighbourhood | |
---|---|---|
0 | NaN | 東城區 |
1 | NaN | 豐臺區 / Fengtai |
2 | NaN | 大興區 / Daxing |
3 | NaN | 密雲縣 / Miyun |
4 | NaN | 平谷區 / Pinggu |
#房源的評論信息。包括房源 listing_id和評論日期,包括評論相關的內容和作者信息。
reviews=pd.read_csv('reviews_detail.csv')
reviews.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
listing_id | id | date | reviewer_id | reviewer_name | comments | |
---|---|---|---|---|---|---|
0 | 44054 | 84748 | 2010-08-25 | 207019 | Jarrod | Sev was very helpful. Sev showed us where to ... |
1 | 44054 | 118384 | 2010-10-13 | 218723 | Kimberly | We arrived in Beijing very early in the mornin... |
2 | 44054 | 436978 | 2011-08-11 | 609177 | Emma | It is a really massive apartment and really co... |
3 | 44054 | 1118657 | 2012-04-12 | 1787536 | Andreyna | Sev was incredibly helpful, showed us around t... |
4 | 44054 | 2140650 | 2012-08-30 | 1179565 | Frances | The appartment was ideal for our party of 6 ad... |
對listings表數據清洗
listings.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
id | name | host_id | host_name | neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 44054 | Modern and Comfortable Living in CBD | 192875 | East Apartments | NaN | 朝陽區 / Chaoyang | 39.89503 | 116.45163 | Entire home/apt | 792 | 1 | 89 | 2019-03-04 | 0.85 | 9 | 341 |
1 | 100213 | The Great Wall Box Deluxe Suite A團園長城小院東院套房 | 527062 | Joe | NaN | 密雲縣 / Miyun | 40.68434 | 117.17231 | Private room | 1201 | 1 | 2 | 2017-10-08 | 0.10 | 4 | 0 |
2 | 128496 | Heart of Beijing: House with View 2 | 467520 | Cindy | NaN | 東城區 | 39.93213 | 116.42200 | Entire home/apt | 389 | 3 | 259 | 2019-02-05 | 2.70 | 1 | 93 |
3 | 161902 | cozy studio in center of Beijing | 707535 | Robert | NaN | 東城區 | 39.93357 | 116.43577 | Entire home/apt | 376 | 1 | 26 | 2016-12-03 | 0.28 | 5 | 290 |
4 | 162144 | nice studio near subway, sleep 4 | 707535 | Robert | NaN | 朝陽區 / Chaoyang | 39.93668 | 116.43798 | Entire home/apt | 537 | 1 | 37 | 2018-08-01 | 0.40 | 5 | 352 |
#對listings表分析
listings.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28452 entries, 0 to 28451
Data columns (total 16 columns):
id 28452 non-null int64
name 28451 non-null object
host_id 28452 non-null int64
host_name 28452 non-null object
neighbourhood_group 0 non-null float64
neighbourhood 28452 non-null object
latitude 28452 non-null float64
longitude 28452 non-null float64
room_type 28452 non-null object
price 28452 non-null int64
minimum_nights 28452 non-null int64
number_of_reviews 28452 non-null int64
last_review 17294 non-null object
reviews_per_month 17294 non-null float64
calculated_host_listings_count 28452 non-null int64
availability_365 28452 non-null int64
dtypes: float64(4), int64(7), object(5)
memory usage: 3.5+ MB
# 觀察發現listings表有幾個問題:
# 1.neighbourhood_group列存在很多空值,查看統計信息
# 2.neighbourhood列有中文有英文,決定刪掉‘/yingwen’,僅保留neighbourhood列中文部分
# 3.查看經緯度是否有異常值
# 4.查看房屋類型有多少種
# 5.查看價格是否存在異常值
# 6.查看最小入住天數是否有異常值
# 7.查看評論數前10的id
# 8.查看每月評論數前十的id
# 9.查看365天中天數是否有異常值
# 10.name,last_review和reviews_per_month中都存在空值,不過影響不大
#發現neighbourhood_group列有很多空值,查看neighbourhood_group列的統計信息
listings['neighbourhood_group'].describe()
count 0.0
mean NaN
std NaN
min NaN
25% NaN
50% NaN
75% NaN
max NaN
Name: neighbourhood_group, dtype: float64
由以上neighbourhood_group列的統計信息可知,neighbourhood_group列全爲空值,無意義
所以決定在listings表中刪除neighbourhood_group列
#刪除neighbourhood_group列
listings=listings.drop(['neighbourhood_group'],axis=1)
listings.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
id | name | host_id | host_name | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 44054 | Modern and Comfortable Living in CBD | 192875 | East Apartments | 朝陽區 / Chaoyang | 39.89503 | 116.45163 | Entire home/apt | 792 | 1 | 89 | 2019-03-04 | 0.85 | 9 | 341 |
1 | 100213 | The Great Wall Box Deluxe Suite A團園長城小院東院套房 | 527062 | Joe | 密雲縣 / Miyun | 40.68434 | 117.17231 | Private room | 1201 | 1 | 2 | 2017-10-08 | 0.10 | 4 | 0 |
2 | 128496 | Heart of Beijing: House with View 2 | 467520 | Cindy | 東城區 | 39.93213 | 116.42200 | Entire home/apt | 389 | 3 | 259 | 2019-02-05 | 2.70 | 1 | 93 |
3 | 161902 | cozy studio in center of Beijing | 707535 | Robert | 東城區 | 39.93357 | 116.43577 | Entire home/apt | 376 | 1 | 26 | 2016-12-03 | 0.28 | 5 | 290 |
4 | 162144 | nice studio near subway, sleep 4 | 707535 | Robert | 朝陽區 / Chaoyang | 39.93668 | 116.43798 | Entire home/apt | 537 | 1 | 37 | 2018-08-01 | 0.40 | 5 | 352 |
#查看neighbourhood列有哪幾種不同元素
listings['neighbourhood'].unique()
array(['朝陽區 / Chaoyang', '密雲縣 / Miyun', '東城區', '西城區', '海淀區',
'順義區 / Shunyi', '房山區', '懷柔區 / Huairou', '昌平區', '通州區 / Tongzhou',
'豐臺區 / Fengtai', '大興區 / Daxing', '延慶縣 / Yanqing', '石景山區',
'門頭溝區 / Mentougou', '平谷區 / Pinggu'], dtype=object)
# Keep only the Chinese part of each district name: drop the "/ English"
# suffix and any surrounding whitespace. Done with vectorised string methods
# instead of a per-row Python loop with .loc assignments.
listings['neighbourhood'] = (
    listings['neighbourhood'].str.split('/').str[0].str.strip()
)
listings['neighbourhood'].unique()
array(['朝陽區', '密雲縣', '東城區', '西城區', '海淀區', '順義區', '房山區', '懷柔區', '昌平區',
'通州區', '豐臺區', '大興區', '延慶縣', '石景山區', '門頭溝區', '平谷區'], dtype=object)
# 查看經緯度是否有異常值
listings['longitude'].describe()
count 28452.000000
mean 116.442000
std 0.204796
min 115.473390
25% 116.355283
50% 116.434665
75% 116.491122
max 117.495270
Name: longitude, dtype: float64
# Side-by-side box plots of longitude and latitude to eyeball outliers.
fig = plt.figure()
for subplot_code, coordinate in zip((121, 122), ('longitude', 'latitude')):
    fig.add_subplot(subplot_code)
    listings.boxplot(column=coordinate)
plt.show()
[外鏈圖片轉存失敗(img-lm4nPlmK-1569078306325)(output_18_0.png)]
#從經緯度的箱型圖看,考慮到北京城區面積大,異常點的誤差都在1°內,
# 所以把所有經緯度數據認爲是正常值範圍。
# 查看房屋類型有多少種
#房屋類型有三種,分別是Entire home/apt,Private room,Shared room
listings['room_type'].unique()
array(['Entire home/apt', 'Private room', 'Shared room'], dtype=object)
# 查看價格是否存在異常值
listings['price'].describe()
count 28452.000000
mean 611.203325
std 1623.535077
min 0.000000
25% 235.000000
50% 389.000000
75% 577.000000
max 68983.000000
Name: price, dtype: float64
# One price box plot per room type, laid out in a single row.
fig = plt.figure()
panels = zip((131, 132, 133), ('Entire home/apt', 'Private room', 'Shared room'))
for subplot_code, room_kind in panels:
    fig.add_subplot(subplot_code)
    listings[listings.room_type == room_kind].boxplot(column='price')
    plt.title(room_kind)
plt.show()
[外鏈圖片轉存失敗(img-2PSdDcwj-1569078306326)(output_24_0.png)]
#查看價格爲0的房源基本信息,
listings[listings['price']==0]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
id | name | host_id | host_name | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
5085 | 20670843 | 【衚衕老宅~輕語竹林房】旅遊絕佳地段】步行即到雍和宮、近故宮天安門、南鑼鼓巷、美食簋街 | 129840905 | Jing | 東城區 | 39.94292 | 116.41323 | Entire home/apt | 0 | 2 | 81 | 2019-03-31 | 4.09 | 8 | 27 |
5806 | 21246510 | 限時 北京二環四合院別墅拍攝聚會 商務會議 娛樂同仁堂老宅 近簋街、雍和宮、東直門、南鑼鼓巷... | 83233661 | Eva | 東城區 | 39.93677 | 116.42076 | Entire home/apt | 0 | 1 | 0 | NaN | NaN | 6 | 167 |
28234 | 33895187 | 測試房源mm2 | 185140389 | Ning Host | 朝陽區 | 39.98147 | 116.47109 | Private room | 0 | 1 | 0 | NaN | NaN | 2 | 359 |
# Judging by the listing names, a price of 0 cannot be genuine, so treat those
# prices as missing. Series.mask returns a float column with NaN in the masked
# rows, so no NaN is written into the integer column in place.
listings['price'] = listings['price'].mask(listings['price'] == 0)
listings['price'].describe()
count 28449.000000
mean 611.267777
std 1623.608547
min 27.000000
25% 235.000000
50% 389.000000
75% 577.000000
max 68983.000000
Name: price, dtype: float64
#查看了一下50000以上的房源的基本信息
#我認爲因爲北京有很多四合院,所以50000以上應該也是存在的吧
#這裏不認爲50000以上是異常值了
listings[listings['price']>50000]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
id | name | host_id | host_name | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1067 | 12689987 | Artistic apartment with culture | 68973377 | 晨斌 | 朝陽區 | 39.92300 | 116.57996 | Entire home/apt | 67104.0 | 1 | 2 | 2016-06-03 | 0.06 | 2 | 365 |
2012 | 15488817 | Hotel apartment close to huge Mall | 68973377 | 晨斌 | 朝陽區 | 39.91962 | 116.59173 | Entire home/apt | 63346.0 | 1 | 16 | 2017-04-17 | 0.55 | 2 | 180 |
5167 | 20748712 | 大望路/九龍山大牀房 | 141070198 | 洋 | 朝陽區 | 39.88798 | 116.47667 | Entire home/apt | 59997.0 | 1 | 1 | 2017-09-05 | 0.05 | 3 | 91 |
6612 | 21942314 | 【溫馨小窩窩】近地鐵一號線五棵松/萬壽路,距離北京西站3站地,15分鐘。 | 48178909 | Qing | 海淀區 | 39.89523 | 116.28252 | Shared room | 59997.0 | 1 | 4 | 2018-03-23 | 0.24 | 1 | 180 |
10170 | 24994830 | 良鄉大學城兩室溫馨小屋 | 188806180 | 王 | 房山區 | 39.72157 | 116.15182 | Entire home/apt | 68828.0 | 1 | 1 | 2018-09-28 | 0.15 | 1 | 181 |
13668 | 27587044 | 房源已下架 | 208158466 | 晶 | 昌平區 | 40.08912 | 116.29895 | Private room | 66667.0 | 30 | 0 | NaN | NaN | 1 | 91 |
14697 | 28134193 | 此房不能租,不要詢問了 | 212328505 | 陳 | 海淀區 | 39.94947 | 116.36246 | Entire home/apt | 68983.0 | 1 | 1 | 2018-09-10 | 0.14 | 1 | 90 |
16207 | 28803519 | 【北京站地鐵3分鐘.故宮周邊最優惠.王府井商圈】溪流到家靜雅民宿 | 216392612 | 容 | 東城區 | 39.90583 | 116.42199 | Entire home/apt | 65970.0 | 1 | 1 | 2018-10-29 | 0.18 | 1 | 0 |
17083 | 29138170 | 全A小築 | 74938348 | 洋 | 朝陽區 | 39.89685 | 116.45925 | Entire home/apt | 59997.0 | 1 | 0 | NaN | NaN | 1 | 0 |
21809 | 31535043 | 水立方,鳥巢附近六人間男神牀位 | 236331220 | 王林 | 昌平區 | 40.07817 | 116.42163 | Shared room | 67909.0 | 1 | 0 | NaN | NaN | 2 | 180 |
listings['minimum_nights'].describe()
count 28452.000000
mean 2.729685
std 17.920932
min 1.000000
25% 1.000000
50% 1.000000
75% 1.000000
max 1125.000000
Name: minimum_nights, dtype: float64
#查看最小入住天數的箱型圖
listings.boxplot(column='minimum_nights')
plt.show()
[外鏈圖片轉存失敗(img-NiB0o4PZ-1569078306327)(output_30_0.png)]
# 查閱入住最小天數爲400天以上的情況
#經查,結合地理位置和房型等信息,入住最小天數爲400天以上是合理的
listings[listings['minimum_nights']>400]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
id | name | host_id | host_name | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1175 | 13183350 | 鳥巢旁歐式羅曼蒂克兩居 | 56183837 | Jack | 朝陽區 | 39.99724 | 116.40110 | Entire home/apt | 463.0 | 1000 | 2 | 2016-07-25 | 0.06 | 1 | 0 |
5609 | 21098578 | 6號14號線金臺路青旅燕兒窩上下鋪國貿CBD | 129486966 | 磊 | 朝陽區 | 39.91924 | 116.48459 | Shared room | 121.0 | 1124 | 4 | 2018-08-03 | 0.33 | 6 | 362 |
6505 | 21841908 | 鳥巢水立方朝南大主臥 | 159278266 | 小夢 | 海淀區 | 40.03241 | 116.36671 | Private room | 255.0 | 1000 | 0 | NaN | NaN | 1 | 0 |
19983 | 30752723 | 中關村新東方北大清華五道口頤和園圓明園魏公村 | 220133868 | 未知 | 海淀區 | 39.98541 | 116.31650 | Entire home/apt | 9998.0 | 1125 | 1 | 2019-01-29 | 0.38 | 2 | 0 |
25664 | 33297102 | 姚家園西里小區 | 250785867 | 子豪 | 朝陽區 | 39.94690 | 116.51349 | Shared room | 2798.0 | 500 | 0 | NaN | NaN | 1 | 365 |
#查看availability_365列
#由統計信息可知,availability_365的最大值是365,最小值是0是合理的
#所以判定availability_365列無異常值
listings['availability_365'].describe()
count 28452.000000
mean 220.342120
std 138.430677
min 0.000000
25% 87.000000
50% 209.000000
75% 361.000000
max 365.000000
Name: availability_365, dtype: float64
對listings表進行數據分析
# Scatter plot of every listing's coordinates, with the mean point in red.
all_longitudes = listings['longitude']
all_latitudes = listings['latitude']
longitude_mean = all_longitudes.mean()
latitude_mean = all_latitudes.mean()
plt.scatter(x=all_longitudes, y=all_latitudes, alpha=0.1)
plt.scatter(x=longitude_mean, y=latitude_mean, c='r')
# The label is the tuple of both formatted means, drawn slightly below the point.
mean_label = ('%1.3f' % longitude_mean, '%1.3f' % latitude_mean)
plt.text(longitude_mean, latitude_mean - 0.1, mean_label,
         ha='center', va='bottom', fontsize=12)
plt.title('北京房源的地理位置分佈情況')
plt.xlabel('經度')
plt.ylabel('緯度')
plt.savefig('北京房源的地理位置分佈散點圖.png', dpi=500, bbox_inches='tight')
plt.show()
[外鏈圖片轉存失敗(img-wnd30ozn-1569078306327)(output_34_0.png)]
由散點圖可看出北京房源的地理位置分佈情況
其中經緯度的均值點爲(116.442,39.983),該點爲北京市朝陽區西壩河路附近
北京朝陽區的房源密度最高
# Bar chart of the number of listings in each district.
# The counts are computed once and reused for heights and tick labels.
district_counts = listings['neighbourhood'].value_counts()
plt.bar(district_counts.index, district_counts)
plt.title('北京各城區房源數量')
# rotation must be a number (or 'vertical'/'horizontal'); the string '45' is
# rejected by current matplotlib releases.
plt.xticks(district_counts.index, district_counts.index, rotation=45)
plt.savefig('北京各城區房源數量.png', dpi=500, bbox_inches='tight')
plt.show()
[外鏈圖片轉存失敗(img-TNNfZQMX-1569078306328)(output_36_0.png)]
listings['neighbourhood'].value_counts()
朝陽區 10810
東城區 3346
海淀區 3197
豐臺區 1758
西城區 1701
通州區 1290
昌平區 1034
密雲縣 935
順義區 920
懷柔區 833
大興區 823
延慶縣 718
房山區 579
石景山區 213
門頭溝區 152
平谷區 143
Name: neighbourhood, dtype: int64
由北京各城區房源數量條形圖可看出,北京朝陽區的房源數量最多,超過10000套,遠遠高於其他行政區,北京平谷區房源最少.
房源數量排名前三位的分別是朝陽區,東城區和海淀區.
listings['neighbourhood'].value_counts()
朝陽區 10810
東城區 3346
海淀區 3197
豐臺區 1758
西城區 1701
通州區 1290
昌平區 1034
密雲縣 935
順義區 920
懷柔區 833
大興區 823
延慶縣 718
房山區 579
石景山區 213
門頭溝區 152
平谷區 143
Name: neighbourhood, dtype: int64
# Listing count per room type (bars, left axis) together with the mean price
# per room type (red line, right axis).
fig, ax1 = plt.subplots()
data1 = listings['room_type'].value_counts()
t = data1.index
# Look the mean prices up by room type and put them in the same order as the
# bars, instead of assuming value_counts() happens to return a fixed order.
data2 = listings.groupby('room_type')['price'].mean().reindex(t)
ax1.bar(t, data1, width=0.3)
ax1.set_ylabel('房源數量')
# Write the count above every bar.
for bar_position, bar_count in enumerate(data1):
    ax1.text(bar_position, bar_count + 200, bar_count, ha='center')
ax2 = ax1.twinx()
ax2.plot(t, data2, c='r')
ax2.set_ylabel('平均價格')
plt.show()
[外鏈圖片轉存失敗(img-yWhxoEAv-1569078306328)(output_40_0.png)]
# Grouped bar chart: number of listings per district, split by room type.
# Districts are ordered by their total number of listings.
labels = listings['neighbourhood'].value_counts().index.tolist()
room_types = ['Entire home/apt', 'Private room', 'Shared room']
x = np.arange(len(labels))
# value_counts() sorts each room type by its own counts, so the raw .values
# would not line up with `labels`. Reindex every series by `labels` (filling
# districts that have no listing of that type with 0) to keep bars aligned.
y1, y2, y3 = [
    listings.loc[listings.room_type == room_type, 'neighbourhood']
    .value_counts()
    .reindex(labels, fill_value=0)
    .values
    for room_type in room_types
]
for slot, (room_type, heights) in enumerate(zip(room_types, (y1, y2, y3))):
    plt.bar(x + 0.25 * slot, heights, width=0.25, label=room_type)
plt.title('北京不同地區不同房型的房源數量')
# rotation must be numeric; the string '60' is rejected by current matplotlib.
plt.xticks(x, labels, rotation=60)
plt.legend()
plt.savefig('北京各城區不同房型的房源數量.png', dpi=500, bbox_inches='tight')
plt.show()
[外鏈圖片轉存失敗(img-CB99ABrO-1569078306328)(output_41_0.png)]
# Line chart: mean price per district for each room type.
room_types = ['Entire home/apt', 'Private room', 'Shared room']
mm = listings.groupby(['neighbourhood', 'room_type'])['price'].mean()
# Pivot to one row per district and one column per room type. This replaces
# the positional mm[3*i + k] lookups, which silently shift every value if any
# district lacks one of the room types, and lets the tick labels come from the
# data instead of a hand-maintained list that must match the group order.
mean_table = mm.unstack('room_type').reindex(columns=room_types)
labels = mean_table.index.tolist()
xx = np.arange(len(labels))
y11 = mean_table['Entire home/apt'].tolist()
y22 = mean_table['Private room'].tolist()
y33 = mean_table['Shared room'].tolist()
plt.plot(xx, y11, label='Entire home/apt')
plt.plot(xx, y22, label='Private room')
plt.plot(xx, y33, label='Shared room')
plt.legend()
# rotation must be numeric; the string '60' is rejected by current matplotlib.
plt.xticks(xx, labels, rotation=60)
plt.show()
[外鏈圖片轉存失敗(img-nnLBtSZJ-1569078306329)(output_42_0.png)]
# Bar chart: mean price per district for each room type.
# In some districts the mean Shared room price exceeds the mean
# Entire home/apt price, which is implausible and points at outliers.
# NOTE: this cell still plots the plain mean (same numbers as the line chart);
# it does not restrict prices to the 25%-75% range.
room_types = ['Entire home/apt', 'Private room', 'Shared room']
mm = listings.groupby(['neighbourhood', 'room_type'])['price'].mean()
# One row per district, one column per room type: keeps every value aligned
# with its district even when a district lacks a room type, and derives the
# tick labels from the data rather than from a hard-coded list.
mean_table = mm.unstack('room_type').reindex(columns=room_types)
labels = mean_table.index.tolist()
xx = np.arange(len(labels))
y11 = mean_table['Entire home/apt'].tolist()
y22 = mean_table['Private room'].tolist()
y33 = mean_table['Shared room'].tolist()
plt.bar(xx, y11, width=0.1, label='Entire home/apt')
plt.bar(xx + 0.1, y22, width=0.1, label='Private room')
plt.bar(xx + 0.2, y33, width=0.1, label='Shared room')
plt.legend()
# rotation must be numeric; the string '60' is rejected by current matplotlib.
plt.xticks(xx, labels, rotation=60)
plt.show()
[外鏈圖片轉存失敗(img-nWVXDVbX-1569078306329)(output_44_0.png)]
考慮到這個圖有部分城區的Shared room比Entire home/apt的平均價格還高,顯然是不合理的
所以我從實際出發,取箱型圖的中位數作爲各城區不同房型的價格參考標準
# Price box plot for each room type, saved to disk before being shown.
fig = plt.figure()
panels = zip((131, 132, 133), ('Entire home/apt', 'Private room', 'Shared room'))
for subplot_code, room_kind in panels:
    fig.add_subplot(subplot_code)
    listings[listings.room_type == room_kind].boxplot(column='price')
    plt.title(room_kind)
plt.savefig('不同房型價格分佈箱型圖.png', dpi=500, bbox_inches='tight')
plt.show()
[外鏈圖片轉存失敗(img-yV66TVHx-1569078306329)(output_46_0.png)]
#不同城區Entire home/apt的中位數
listings[listings.room_type=='Entire home/apt'].groupby('neighbourhood')['price'].describe()['50%']
neighbourhood
東城區 530.0
豐臺區 396.0
大興區 379.0
密雲縣 799.0
平谷區 819.0
延慶縣 1000.0
懷柔區 1678.0
房山區 282.0
昌平區 537.0
朝陽區 470.0
海淀區 490.0
石景山區 429.0
西城區 497.0
通州區 336.0
門頭溝區 289.0
順義區 396.0
Name: 50%, dtype: float64
#不同城區Private room的中位數
listings[listings.room_type=='Private room'].groupby('neighbourhood')['price'].describe()['50%']
neighbourhood
東城區 336.0
豐臺區 188.0
大興區 177.5
密雲縣 356.0
平谷區 382.0
延慶縣 497.0
懷柔區 537.0
房山區 201.0
昌平區 201.0
朝陽區 215.0
海淀區 242.0
石景山區 251.5
西城區 302.0
通州區 188.0
門頭溝區 899.0
順義區 255.0
Name: 50%, dtype: float64
#不同城區Shared room的中位數
listings[listings.room_type=='Shared room'].groupby('neighbourhood')['price'].describe()['50%']
neighbourhood
東城區 107.0
豐臺區 127.0
大興區 148.0
密雲縣 148.0
平谷區 101.0
延慶縣 1188.0
懷柔區 886.0
房山區 174.0
昌平區 107.0
朝陽區 101.0
海淀區 101.0
石景山區 140.5
西城區 107.0
通州區 94.0
門頭溝區 94.0
順義區 157.5
Name: 50%, dtype: float64
# Line chart of the median price per district for each room type
# (kept for reference; the bar version of this chart is the one that is saved).
room_types = ['Entire home/apt', 'Private room', 'Shared room']
# One row per district, one column per room type. Building the table once
# keeps the three series aligned with each other and with the tick labels,
# instead of trusting three separate .values arrays to match a hard-coded list.
median_table = (
    listings.groupby(['neighbourhood', 'room_type'])['price']
    .median()
    .unstack('room_type')
    .reindex(columns=room_types)
)
labels = median_table.index.tolist()
xx = np.arange(len(labels))
y11 = median_table['Entire home/apt'].values
y22 = median_table['Private room'].values
y33 = median_table['Shared room'].values
plt.plot(xx, y11, label='Entire home/apt')
plt.plot(xx, y22, label='Private room')
plt.plot(xx, y33, label='Shared room')
plt.legend()
# rotation must be numeric; the string '60' is rejected by current matplotlib.
plt.xticks(xx, labels, rotation=60)
plt.show()
[外鏈圖片轉存失敗(img-xiRIpoJM-1569078306330)(output_50_0.png)]
# Grouped bar chart of the median price per district for each room type.
room_types = ['Entire home/apt', 'Private room', 'Shared room']
# One row per district, one column per room type. Building the table once
# keeps the three series aligned with each other and with the tick labels,
# instead of trusting three separate .values arrays to match a hard-coded list.
median_table = (
    listings.groupby(['neighbourhood', 'room_type'])['price']
    .median()
    .unstack('room_type')
    .reindex(columns=room_types)
)
labels = median_table.index.tolist()
xx = np.arange(len(labels))
y11 = median_table['Entire home/apt'].values
y22 = median_table['Private room'].values
y33 = median_table['Shared room'].values
plt.bar(xx, y11, width=0.2, label='Entire home/apt')
plt.bar(xx + 0.2, y22, width=0.2, label='Private room')
plt.bar(xx + 0.4, y33, width=0.2, label='Shared room')
plt.legend()
# rotation must be numeric; the string '60' is rejected by current matplotlib.
plt.xticks(xx, labels, rotation=60)
plt.grid(axis='y', alpha=0.2)
plt.title('北京地區不同城區不同房型中位數價格分佈')
plt.savefig('北京地區各城區不同房型中位數價格分佈.png', dpi=500, bbox_inches='tight')
plt.show()
[外鏈圖片轉存失敗(img-tqB1XZ1n-1569078306330)(output_51_0.png)]
懷柔區,延慶縣,平谷區,密雲縣的Entire home/apt的中位數價格遠高於北京主城區,可能是因爲這些行政區內有多處度假村。
延慶縣,懷柔區的Shared room的價格高於Private room,可能是因爲他們Shared room的樣本數據太少,均只有三組數據。
門頭溝區的Private room價格遠高於Entire home/apt,可能是因爲門頭溝區的Private room的樣本數據較少,只有39組,且提供的數據中高租金價格佔比較多。
len(listings[(listings.neighbourhood=='密雲縣') & (listings.room_type=='Entire home/apt')]['price'])
496
jiagecanzhaobiao=listings.groupby(['neighbourhood','room_type'])['price'].describe()['50%']
jiagecanzhaobiao.to_excel('價格參照表.xlsx')
len(listings[(listings.neighbourhood=='密雲縣') & (listings.room_type=='Shared room')]['price'])
5
收集大客戶房東信息
# Rank hosts by their number of listings and keep the top 20 (the slice below
# is [:20], not 10); these are treated as the key-account hosts.
# The result is exported to an Excel sheet.
dakehufangdong=listings.groupby(['host_id','host_name']).agg({'id':'count'}).sort_values(by='id',axis=0,ascending=False)[:20]
dakehufangdong.to_excel('大客戶房東信息表.xlsx')
# 查看第一名大客戶"美婷"的房源分佈情況
listings[listings.host_id==209669028]['neighbourhood'].value_counts()
朝陽區 178
東城區 44
Name: neighbourhood, dtype: int64
# 查看第二名大客戶"興偉"的房源分佈情況
listings[listings.host_id==54436429]['neighbourhood'].value_counts()
海淀區 137
朝陽區 57
東城區 15
豐臺區 1
Name: neighbourhood, dtype: int64
# 查看第三名大客戶"海梅"的房源分佈情況
listings[listings.host_id==156249912]['neighbourhood'].value_counts()
朝陽區 113
海淀區 2
Name: neighbourhood, dtype: int64
# 查看第四名大客戶"Cathy"的房源分佈情況
# listings[listings.host_id==17619297]
listings[listings.host_id==17619297]['neighbourhood'].value_counts()
海淀區 47
朝陽區 45
西城區 4
昌平區 1
東城區 1
Name: neighbourhood, dtype: int64
# District distribution of the listings owned by the fifth-ranked key-account
# host, "金桔精品民宿" (host_id 156143513).
# listings[listings.host_id==156143513]
listings[listings.host_id==156143513]['neighbourhood'].value_counts()
通州區 54
順義區 19
朝陽區 4
Name: neighbourhood, dtype: int64
# Grouped bar chart of where the five largest hosts' listings are located.
# The counts are transcribed by hand from the value_counts() outputs above;
# each list holds one value per host, in the order given by `labels`.
x = np.arange(5)
labels = ['美婷', '興偉', '海梅', 'Cathy', '金桔精品民宿']
y1 = [0, 137, 2, 47, 0]      # 海淀區
y2 = [178, 57, 113, 45, 4]   # 朝陽區
y3 = [44, 15, 0, 1, 0]       # 東城區
y4 = [0, 1, 0, 0, 0]         # 豐臺區
y5 = [0, 0, 0, 1, 0]         # 昌平區
y6 = [0, 0, 0, 4, 0]         # 西城區
y7 = [0, 0, 0, 0, 54]        # 通州區
y8 = [0, 0, 0, 0, 19]        # 順義區
district_series = [
    (0, y1, '海淀區'),
    (0.1, y2, '朝陽區'),
    (0.2, y3, '東城區'),
    (0.3, y4, '豐臺區'),
    (0.4, y5, '昌平區'),
    (0.5, y6, '西城區'),
    (0.6, y7, '通州區'),
    (0.7, y8, '順義區'),
]
for shift, heights, district in district_series:
    plt.bar(x + shift, heights, width=0.1, label=district)
plt.xticks(np.arange(5), labels)
plt.legend()
plt.title('前五名大房東的房源分佈圖')
plt.savefig('前五名大房東的房源分佈圖.png', dpi=500, bbox_inches='tight')
plt.show()
[外鏈圖片轉存失敗(img-UlqCKdXy-1569078306330)(output_63_0.png)]
由上圖可以看出,房源數量排名前五的大房東中朝陽區和海淀區的房源數量最多,而且這些大房東的房源分佈通常在兩個到三個行政區.
對calendar表數據清洗
calendar.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
listing_id | date | available | price | adjusted_price | minimum_nights | maximum_nights | |
---|---|---|---|---|---|---|---|
0 | 1165040 | 2019-04-17 | f | $511.00 | $511.00 | 1.0 | 1125.0 |
1 | 1165040 | 2019-04-18 | t | $511.00 | $511.00 | 1.0 | 1125.0 |
2 | 1165040 | 2019-04-19 | t | $511.00 | $511.00 | 1.0 | 1125.0 |
3 | 1165040 | 2019-04-20 | t | $511.00 | $511.00 | 1.0 | 1125.0 |
4 | 1165040 | 2019-04-21 | t | $511.00 | $511.00 | 1.0 | 1125.0 |
# Convert the "$1,234.00"-style strings in price and adjusted_price to floats.
# Vectorised string handling replaces the row-by-row loop that wrote into
# Series.values: it is far faster on a table with one row per listing per day,
# gives the columns a real float dtype, and tolerates missing values.
for money_column in ('price', 'adjusted_price'):
    # Skip columns that are already numeric so the cell can be re-run safely.
    if not pd.api.types.is_numeric_dtype(calendar[money_column]):
        calendar[money_column] = (
            calendar[money_column]
            .str.replace(r'[$,]', '', regex=True)
            .astype(float)
        )
# Inspect the cleaned calendar table.
calendar.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
listing_id | date | available | price | adjusted_price | minimum_nights | maximum_nights | |
---|---|---|---|---|---|---|---|
0 | 1165040 | 2019-04-17 | f | 511 | 511 | 1.0 | 1125.0 |
1 | 1165040 | 2019-04-18 | t | 511 | 511 | 1.0 | 1125.0 |
2 | 1165040 | 2019-04-19 | t | 511 | 511 | 1.0 | 1125.0 |
3 | 1165040 | 2019-04-20 | t | 511 | 511 | 1.0 | 1125.0 |
4 | 1165040 | 2019-04-21 | t | 511 | 511 | 1.0 | 1125.0 |
查找calendar表指定數據
# Scenario: an ordinary tourist travelling with their mother wants a private
# room in Chaoyang district priced roughly between 300 and 1000, checking in
# on 2019-10-01 and out on 2019-10-04 (three nights).
# `want` holds the basic information of the listings that fit these criteria.
is_private_room = listings.room_type == 'Private room'
within_budget = (listings.price > 300) & (listings.price < 1000)
has_availability = listings.availability_365 > 0
short_minimum_stay = listings.minimum_nights < 4
in_chaoyang = listings.neighbourhood == '朝陽區'
want = listings[
    is_private_room
    & within_budget
    & has_availability
    & short_minimum_stay
    & in_chaoyang
]
len(want)
# Calendar rows for the candidate listings in `want`, narrowed day by day:
# each date keeps only the listings that were available on that date and on
# every earlier date, so new4 contains listings available on all four dates.
candidate_ids = want['id'].values
daily_frames = []
for stay_date in ('2019-10-01', '2019-10-02', '2019-10-03', '2019-10-04'):
    available_that_day = calendar[
        (calendar.date == stay_date)
        & (calendar.available == 't')
        & (calendar['listing_id'].isin(candidate_ids))
    ]
    daily_frames.append(available_that_day)
    candidate_ids = available_that_day['listing_id'].values
new1, new2, new3, new4 = daily_frames
#查看calendar建成的表new4的統計信息
new4[['minimum_nights','maximum_nights']].describe()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
minimum_nights | maximum_nights | |
---|---|---|
count | 513.000000 | 513.000000 |
mean | 1.222222 | 939.087719 |
std | 1.168154 | 391.935275 |
min | 1.000000 | 1.000000 |
25% | 1.000000 | 1125.000000 |
50% | 1.000000 | 1125.000000 |
75% | 1.000000 | 1125.000000 |
max | 24.000000 | 1125.000000 |
# Keep the rows of new4 whose booking rules allow a three-night stay:
# the minimum must be at most 3 nights (a 4-night minimum would rule the stay
# out) and the maximum must be at least 3 nights.
new5 = new4[(new4.minimum_nights <= 3) & (new4.maximum_nights >= 3)]
# new5 is the calendar information of the listings that meet every requirement.
對reviews表數據清洗
reviews.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
listing_id | id | date | reviewer_id | reviewer_name | comments | |
---|---|---|---|---|---|---|
0 | 44054 | 84748 | 2010-08-25 | 207019 | Jarrod | Sev was very helpful. Sev showed us where to ... |
1 | 44054 | 118384 | 2010-10-13 | 218723 | Kimberly | We arrived in Beijing very early in the mornin... |
2 | 44054 | 436978 | 2011-08-11 | 609177 | Emma | It is a really massive apartment and really co... |
3 | 44054 | 1118657 | 2012-04-12 | 1787536 | Andreyna | Sev was incredibly helpful, showed us around t... |
4 | 44054 | 2140650 | 2012-08-30 | 1179565 | Frances | The appartment was ideal for our party of 6 ad... |
reviews.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 202099 entries, 0 to 202098
Data columns (total 6 columns):
listing_id 202099 non-null int64
id 202099 non-null int64
date 202099 non-null object
reviewer_id 202099 non-null int64
reviewer_name 202093 non-null object
comments 201983 non-null object
dtypes: int64(3), object(3)
memory usage: 9.3+ MB
# 雖然reviewer_name有部分缺失,但是reviewer_id沒有缺失,所以沒有關係
# comments缺失也是可以接受的
#查找前20名評論次數最多的reviewer_id
top20_reviewers=reviews['reviewer_id'].value_counts()[:20]
top20_reviewers
186684246 43
21067785 35
158695647 34
99325050 32
149769588 26
140955472 26
213893643 24
6532783 23
229832388 23
196283240 23
28903457 23
117241519 21
104082034 21
3671922 21
165536239 20
16660997 20
10684339 20
228835331 20
43905550 19
50995265 19
Name: reviewer_id, dtype: int64
收集大客戶住戶信息
# Build `topreviewer`: id, name and total number of reviews for the 20
# reviewers who wrote the most reviews (taken from top20_reviewers).
# Each reviewer's name is their first non-missing reviewer_name. The previous
# version assigned the whole unique() array into a single cell, which breaks
# as soon as a reviewer id appears under more than one name.
reviewer_names = (
    reviews.dropna(subset=['reviewer_name'])
    .drop_duplicates('reviewer_id')
    .set_index('reviewer_id')['reviewer_name']
)
topreviewer = pd.DataFrame({
    'top_reviewer_id': top20_reviewers.index,
    'top_reviewer_name': reviewer_names.reindex(top20_reviewers.index).values,
    'sum_reviews': top20_reviewers.values,
})
topreviewer
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
top_reviewer_id | top_reviewer_name | sum_reviews | |
---|---|---|---|
0 | 186684246 | Tomm | 43 |
1 | 21067785 | Jasmine | 35 |
2 | 158695647 | 未 | 34 |
3 | 99325050 | 新月 | 32 |
4 | 149769588 | 金龍 | 26 |
5 | 140955472 | Marines | 26 |
6 | 213893643 | 賽亞 | 24 |
7 | 6532783 | Dee | 23 |
8 | 229832388 | 星河 | 23 |
9 | 196283240 | 羊陽 | 23 |
10 | 28903457 | Yan | 23 |
11 | 117241519 | 蘭蘭 | 21 |
12 | 104082034 | Jonmiae | 21 |
13 | 3671922 | Kum Hong | 21 |
14 | 165536239 | Holm | 20 |
15 | 16660997 | Tao | 20 |
16 | 10684339 | Mia | 20 |
17 | 228835331 | Y | 20 |
18 | 43905550 | Salome | 19 |
19 | 50995265 | Bitong | 19 |
# 把topreviewer保存成表格topreviewer
topreviewer.to_excel('topreviewer.xlsx',index = False)
最受歡迎民宿特點
製作評論詞雲
# Strip the '\r\n' line breaks from every review comment and write all
# comments to comments1.txt as input for the word cloud.
# Missing comments are skipped (they used to be written as the literal text
# "nan"), and comments are separated by a space so that the last word of one
# comment is not fused with the first word of the next.
comment_texts = (
    reviews['comments']
    .dropna()
    .astype(str)
    .str.replace('\r\n', '', regex=False)
)
# The with-block closes the file even if writing fails.
with open('comments1.txt', 'w', encoding='utf-8') as file:
    file.write(' '.join(comment_texts))
#導入詞雲相關的庫
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator
import numpy as np
import jieba
import wordcloud
# Draw the word cloud of all review comments, shaped and coloured by the
# Beijing map image.
# Read the comment text inside a with-block so the file handle is closed
# (the previous version opened the file and never closed it).
with open('comments1.txt', 'r', encoding='utf-8') as file:
    data = file.read()
path_img = 'beijingmap.jpg'
background_image = np.array(Image.open(path_img))
w = wordcloud.WordCloud(font_path='./fonts/simhei.ttf', max_words=100,
                        background_color="white",
                        mask=background_image).generate(data)
# Take the word colours from the mask image.
image_colors = ImageColorGenerator(background_image)
# Show the recoloured cloud without axes, then save it to disk.
plt.imshow(w.recolor(color_func=image_colors))
plt.axis("off")
plt.show()
w.to_file('comments.png')
[外鏈圖片轉存失敗(img-vIn6IWtk-1569078306331)(output_87_0.png)]
<wordcloud.wordcloud.WordCloud at 0x22c544f7e48>
從評論的詞雲圖裏可看出,旅客最看重交通便利,房東熱情,房屋乾淨這三點。
此外部分旅客還會關注設施齊全,牀舒服,離地鐵站近,有家的感覺等。
reviews.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
listing_id | id | date | reviewer_id | reviewer_name | comments | |
---|---|---|---|---|---|---|
0 | 44054 | 84748 | 2010-08-25 | 207019 | Jarrod | Sev was very helpful. Sev showed us where to ... |
1 | 44054 | 118384 | 2010-10-13 | 218723 | Kimberly | We arrived in Beijing very early in the mornin... |
2 | 44054 | 436978 | 2011-08-11 | 609177 | Emma | It is a really massive apartment and really co... |
3 | 44054 | 1118657 | 2012-04-12 | 1787536 | Andreyna | Sev was incredibly helpful, showed us around t... |
4 | 44054 | 2140650 | 2012-08-30 | 1179565 | Frances | The appartment was ideal for our party of 6 ad... |
len(reviews)
202099
#評論表中不同房源出現次數
#np.array實現把index轉換爲數組
np.array(reviews['listing_id'].value_counts().index)
array([ 6622351, 6596814, 11911698, ..., 33781069, 28482595, 33261981],
dtype=int64)
# 取前5%
haofangzi_id=np.array(reviews['listing_id'].value_counts().index)[:865]
listings.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
id | name | host_id | host_name | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 44054 | Modern and Comfortable Living in CBD | 192875 | East Apartments | 朝陽區 | 39.89503 | 116.45163 | Entire home/apt | 792.0 | 1 | 89 | 2019-03-04 | 0.85 | 9 | 341 |
1 | 100213 | The Great Wall Box Deluxe Suite A團園長城小院東院套房 | 527062 | Joe | 密雲縣 | 40.68434 | 117.17231 | Private room | 1201.0 | 1 | 2 | 2017-10-08 | 0.10 | 4 | 0 |
2 | 128496 | Heart of Beijing: House with View 2 | 467520 | Cindy | 東城區 | 39.93213 | 116.42200 | Entire home/apt | 389.0 | 3 | 259 | 2019-02-05 | 2.70 | 1 | 93 |
3 | 161902 | cozy studio in center of Beijing | 707535 | Robert | 東城區 | 39.93357 | 116.43577 | Entire home/apt | 376.0 | 1 | 26 | 2016-12-03 | 0.28 | 5 | 290 |
4 | 162144 | nice studio near subway, sleep 4 | 707535 | Robert | 朝陽區 | 39.93668 | 116.43798 | Entire home/apt | 537.0 | 1 | 37 | 2018-08-01 | 0.40 | 5 | 352 |
# District and room type of each of the most-reviewed listings, in the same
# order as haofangzi_id.
# Index the listings by id once and look all ids up together, instead of
# scanning the whole table twice per id inside a loop with a hard-coded length.
# drop_duplicates keeps the first row per id, matching the old `.values[0]`.
top_listings = listings.drop_duplicates('id').set_index('id').loc[haofangzi_id]
aneighbour = top_listings['neighbourhood'].tolist()
broomtype = top_listings['room_type'].tolist()
# from collections import Counter
# Counter(aneighbour) 它是用來統計不同元素出現次數的方法
#np.unique(y,return_counts=True) 這個方法也是統計不同元素出現次數的方法
aaneighbour=np.unique(aneighbour,return_counts=True)
bbroomtype=np.unique( broomtype,return_counts=True)
# Bar chart: districts of the top 5% most-reviewed listings.
# aaneighbour is the (unique districts, counts) pair from np.unique.
plt.bar(aaneighbour[0], aaneighbour[1])
# rotation must be numeric; the string '60' is rejected by current matplotlib.
plt.xticks(rotation=60)
plt.grid(axis='y', alpha=0.2)
plt.title('入住次數前5%的房源地區分佈')
plt.savefig('入住次數前5%的房源地區分佈.png', dpi=500, bbox_inches='tight')
plt.show()
[外鏈圖片轉存失敗(img-BSjHOqar-1569078306332)(output_97_0.png)]
# 從上圖可以看出,朝陽區和東城區的民宿入住需求最高。
plt.bar(bbroomtype[0],bbroomtype[1],width=0.25)
# plt.xticks(rotation='60')
plt.grid(axis='y',alpha=0.2)
plt.title('入住次數前5%的房型分佈')
plt.savefig('入住次數前5%的房型分佈.png',dpi=500,bbox_inches = 'tight')
plt.show()
[外鏈圖片轉存失敗(img-74gelpwG-1569078306332)(output_99_0.png)]
# Pie chart: room types of the top 5% most-reviewed listings.
# bbroomtype is the (unique room types, counts) pair from np.unique.
plt.figure(figsize=(5, 5))
values = bbroomtype[1].tolist()
labels = bbroomtype[0].tolist()
# Offset every wedge slightly from the centre. Sized from the data so the
# chart still works when fewer than three room types are present.
explode = [0.01] * len(values)
plt.pie(values, explode=explode, labels=labels, autopct='%1.1f%%', startangle=261)
plt.title('入住次數前5%的房型分佈餅圖')
plt.savefig('入住次數前5%的房型分佈餅圖', dpi=500, bbox_inches='tight')
plt.show()
[外鏈圖片轉存失敗(img-mEcm9b8L-1569078306332)(output_100_0.png)]
# Price distribution of the top 5% most-reviewed listings, as a box plot.
# Look all prices up in one indexed selection instead of scanning the whole
# listings table once per id; drop_duplicates keeps the first row per id,
# matching the old `.values[0]`.
pprice = (
    listings.drop_duplicates('id')
    .set_index('id')
    .loc[haofangzi_id, 'price']
    .tolist()
)
dandan = pd.DataFrame(pprice)
dandan.plot.box(title="入住次數前5%的價格分佈")
plt.grid(linestyle="--", alpha=0.3)
# NOTE(review): the file name ends in "餅圖" (pie chart) although this is a
# box plot; left unchanged so existing output paths keep working.
plt.savefig('入住次數前5%的價格分佈餅圖', dpi=500, bbox_inches='tight')
plt.show()
[外鏈圖片轉存失敗(img-urzMcN8y-1569078306333)(output_103_0.png)]
# 入住次數前5%的價格分佈散點圖
plt.scatter(x=pprice,y=np.arange(len(pprice)))
plt.show()
[外鏈圖片轉存失敗(img-YLRZC1YC-1569078306333)(output_104_0.png)]
dandan.describe()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
0 | |
---|---|
count | 864.000000 |
mean | 383.391204 |
std | 259.259195 |
min | 67.000000 |
25% | 201.000000 |
50% | 329.000000 |
75% | 483.000000 |
max | 2221.000000 |