詞頻統計

# pip  install bs4
from bs4 import BeautifulSoup   # python 爬蟲利器
"""
Beautiful Soup 是一個可以從HTML或XML文件中提取數據的Python庫.
它能夠通過你喜歡的轉換器實現慣用的文檔導航,查找,
修改文檔的方式.Beautiful Soup會幫你節省數小時甚至數天的工作時間.
"""
import requests
# Address of the blog post whose text is analysed below.
blog_url = 'https://blog.51cto.com/13118411/2154806'
# Always bound the wait: without a timeout, requests may block forever if the
# server accepts the connection but never answers.
data = requests.get(blog_url, timeout=10)
print(data)       # short summary of the response object
print(data.text)  # response body decoded to text
<Response [200]>
<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <link type="favicon" rel="shortcut icon" href="/favicon.ico" />
        <title>天氣預報定製-cooperfang的博客-51CTO博客</title>
    <meta name="keywords" content="requests,json">
<meta name="description" content="#apiaplicationprogramminginterface#不通軟件不同系統之間的功能相互調用#json是其中重要的一種數據交換形式#定製天氣預報https://www.sojson.com/open/api/weather/json.shtml?city=#http://jsonviewer.stack.hu/#https://www.sojson.com/open/api/weath">    <link href="https://static1.51cto.com/edu/blog/css/header.css?v=1.0.5.1" rel="stylesheet"><link href="https://static1.51cto.com/edu/blog/css/other.css?v=1.0.3.2" rel="stylesheet"><link href="https://static1.51cto.com/edu/blog/css/right.css?v=1.0.4.7" rel="stylesheet"><link href="https://static1.51cto.com/edu/blog/css/blog_details.css?v=1.0.7.1" rel="stylesheet"><link href="https://static1.51cto.com/edu/blog/css/highlight.css" rel="stylesheet">
    <script src="https://static1.51cto.com/edu/center/js/jquery.min.js"></script><script src="https://static1.51cto.com/edu/blog/js/cookie.js"></script><script src="https://static1.51cto.com/edu/blog/js/login.js?v=1.0.0.6"></script><script src="https://static1.51cto.com/edu/blog/js/common.js?v=1.0.0.8"></script><script src="https://static1.51cto.com/edu/blog/js/mbox.js"></script><script src="https://static1.51cto.com/edu/blog/js/follow.js?v=1.0.0.8"></script><script src="https://static1.51cto.com/edu/blog/js/vip.js?v=1.0.0.1"></script></head>
<body>
<img src="https://static1.51cto.com/edu/blog/mobile/images/share_default.jpg" border="0" style="width:0; height:0; position:absolute;">
<!--[if lt IE 9]>
  <script src="https://oss.maxcdn.com/libs/html5shiv/3.7.0/html5shiv.js"></script>
  <script src="https://oss.maxcdn.com/libs/respond.js/1.3.0/respond.min.js"></script>
<![endif]-->

<div class="Header">
  <div class="Page ">
    <h1 class="Logo"><a href="https://blog.51cto.com/">Logo</a></h1>
    <ul class="Navigates fl">
      <li ><a href="https://blog.51cto.com/">首頁</a></li>
      <li ><a href="https://blog.51cto.com/original">文章</a></li>
      <li ><a href="https://blog.51cto.com/blog/follow">關注</a></li>
      <li class="">
        <a class="column-stress" href="https://blog.51cto.com/cloumn/index">訂閱專欄<b></b></a>
      </li>
            <li class="">
        <a href="https://blog.51cto.com/expert">專家</a>
      </li>
          </ul>
    <ul class="Navigates Navigates-right fr">
      <li class="more maps">
        <a href="javascript:void(0);">網站導航</a>
        <div>
            <a href="http://edu.51cto.com" target="_blank">學院</a>
            <a href="https://blog.51cto.com" target="_blank">博客</a>
            <a href="http://down.51cto.com" target="_blank">下載</a>
            <a href="http://home.51cto.com" target="_blank">家園</a>
            <a href="http://bbs.51cto.com" target="_blank">論壇</a>
            <a href="http://x.51cto.com" target="_blank">CTO訓練營</a>
            <a href=" http://club.51cto.com?blog" target="_blank">CTO俱樂部</a>
            <a href="http://wot.51cto.com" target="_blank">WOT</a>
            <a href="http://www.51cto.com" target="_blank">51CTO</a>
            <i class="arrow"></i>
        </div>
      </li>
                <li><a href="http://home.51cto.com/user/register?reback=http%253A%252F%252Fblog.51cto.com%252F13118411%252F2154806" target="_self">註冊</a></li>
          <li class="login"><a href="/user/login?reback=http%3A%2F%2Fblog.51cto.com%2F13118411%2F2154806" target="_self">登錄</a></li>
                        <li class="mRead">
          <a href="javascript:;">手機閱讀</a>
          <div>
            <img src="https://static1.51cto.com/edu/blog/images/blog_qr.png">
            <p>掃一掃體驗手機閱讀</p>
            <i class="arrow"></i>
          </div>
        </li>
            <li class="search"><a href="https://blog.51cto.com/search/index"  target="_self">搜索</a></li>
                  <li class="write"><a href="javascript:;" onClick="Login()">寫文章</a></li>
                  </ul>

          <div class="clear"></div>
  </div>
</div>
<script>
    var isLogin = '0';
    var userId = '';
    var imgpath = 'https://s1.51cto.com/';
    var BLOG_URL = 'https://blog.51cto.com/';
    var msg_num_url = '/index/ajax-msg-num';
    $('.msg-follow, .msg-follow-max').eq(1).css({top: '91px'});
    $('.msg-follow, .msg-follow-max').eq(2).css({top: '121px'});
    setTimeout(function(){
            $.ajax({
                url:msg_num_url,
                type:"get",
                dataType:'json',
                success:function(res){
                    if(res.status == '0'){
                       //
                       var hasNewMsg = false;
                       if(res.data.msgNum > 0 && !$('#myMsg i').hasClass('dot')){
                            $('#myMsg i').addClass('dot');
                            hasNewMsg = true;
                       }
                       if(res.data.notifyNum > 0 && !$('#myNotify i').hasClass('dot')){
                           $('#myNotify i').addClass('dot');
                           hasNewMsg = true;
                       }
                       if(res.data.recommend_new > 0 && !$('#myRecommend i').hasClass('dot')){
                           $('#myRecommend i').addClass('dot');
                           hasNewMsg = true;
                       }
                       if(hasNewMsg && !$('#myAllMsg i').hasClass('dot')){
                           $('#myAllMsg i').addClass('dot');
                       }
                    }

                }
            });
    },70);
</script>
<div class="Content-box">
        <link rel="stylesheet" href="https://static1.51cto.com/edu/blog/css/mdeShow.css?v=1.0.0.9">
<link rel="stylesheet" href="https://static1.51cto.com/edu/blog/css/tinyscrollbar.css"/>
<script type="text/javascript" src="https://static1.51cto.com/edu/blog/js/jquery.tinyscrollbar.js"></script>
<div class="Content Index" style="padding-bottom: 50px;">
    <div class="Page M764">
        <!-- left start -->
        <div class="artical-Left-blog">
            <div class="status">
                                <a class="tab_name original">原創</a>
                            </div>
            <h1 class="artical-title">天氣預報定製</h1>
            <div class="artical-title-list">
                <div class="is-vip-bg-6 fl">
                    <a href="https://blog.51cto.com/13118411" class="a-img" target="_blank"><img class="is-vip-img is-vip-img-4" data-uid="13108411" src="http://ucenter.51cto.com/images/noavatar_middle.gif"></a>
                </div>
                <a href="https://blog.51cto.com/13118411" class="name fl" target="_blank">cooperfang</a>
                                <a class="comment comment-num fr"><font class="comment_number">0</font>人評論</a>
                <span class="fr"></span>
                <a href="javascript:;" class="read fr">124人閱讀</a>
                <a href="javascript:;" class="time fr">2018-08-04 22:59:05</a>
                <div class="clear"></div>
            </div>
                            <div class="artical-content-bak main-content">
                    <div class="con artical-content editor-preview-side"><pre><code class="language-python"># api aplication programming interface
# 不通軟件不同系統之間的功能相互調用
# json是其中重要的一種數據交換形式
# 定製天氣預報 https://www.sojson.com/open/api/weather/json.shtml?city=
# http://jsonviewer.stack.hu/
# https://www.sojson.com/open/api/weather/json.shtml

?city=%E5%8C%97%E4%BA%AC</code></pre>
<pre><code class="language-python">import requests # pip install requests 請求  網上api的調用形式
url = 'https://www.sojson.com/open/api/weather/json.shtml?city='
city = '北京'
ret = requests.get(url + city) # 請求的對象
print(ret.json())</code></pre>
<pre><code>{'date': '20180804', 'message': 'Success !', 'status': 200, 'city': '北京', 'count': 9, 'data': {'shidu': '70%', 'pm25': 44.0, 'pm10': 78.0, 'quality': '良', 'wendu': '30', 'ganmao': '極少數敏感人羣應減少戶外活動', 'yesterday': {'date': '03日星期五', 'sunrise': '05:13', 'high': '高溫 36.0℃', 'low': '低溫 26.0℃', 'sunset': '19:27', 'aqi': 107.0, 'fx': '南風', 'fl': '&lt;3級', 'type': '晴', 'notice': '願你擁有比陽光明媚的心情'}, 'forecast': [{'date': '04日星期六', 'sunrise': '05:14', 'high': '高溫 36.0℃', 'low': '低溫 27.0℃', 'sunset': '19:26', 'aqi': 97.0, 'fx': '南風', 'fl': '&lt;3級', 'type': '晴', 'notice': '願你擁有比陽光明媚的心情'}, {'date': '05日星期日', 'sunrise': '05:15', 'high': '高溫 35.0℃', 'low': '低溫 25.0℃', 'sunset': '19:25', 'aqi': 103.0, 'fx': '東南風', 'fl': '&lt;3級', 'type': '雷陣雨', 'notice': '帶好雨具,別在樹下躲雨'}, {'date': '06日星期一', 'sunrise': '05:16', 'high': '高溫 31.0℃', 'low': '低溫 25.0℃', 'sunset': '19:24', 'aqi': 97.0, 'fx': '南風', 'fl': '&lt;3級', 'type': '雷陣雨', 'notice': '帶好雨具,別在樹下躲雨'}, {'date': '07日星期二', 'sunrise': '05:17', 'high': '高溫 31.0℃', 'low': '低溫 25.0℃', 'sunset': '19:22', 'aqi': 113.0, 'fx': '西南風', 'fl': '&lt;3級', 'type': '雷陣雨', 'notice': '帶好雨具,別在樹下躲雨'}, {'date': '08日星期三', 'sunrise': '05:18', 'high': '高溫 30.0℃', 'low': '低溫 24.0℃', 'sunset': '19:21', 'aqi': 68.0, 'fx': '東南風', 'fl': '&lt;3級', 'type': '雷陣雨', 'notice': '帶好雨具,別在樹下躲雨'}]}}</code></pre>
<pre><code class="language-python"># 象字典一樣取值
d = ret.json()
# print(d['status'])
# print(d['city'])
# print(d['data'])
# print(d['data']['yesterday'])

def hot_weather(data):
    """定製化天氣預報"""
    try:
        weather_list = data['data']['forecast']
    #     print(weather_list)
        for day in weather_list:
            print(day['date'], day['high'], day['low'], day['sunset'], day['notice'])
    except Exception as e:
        print(e)
hot_weather(d)</code></pre>
<pre><code>04日星期六 高溫 36.0℃ 低溫 27.0℃ 19:26 願你擁有比陽光明媚的心情
05日星期日 高溫 35.0℃ 低溫 25.0℃ 19:25 帶好雨具,別在樹下躲雨
06日星期一 高溫 31.0℃ 低溫 25.0℃ 19:24 帶好雨具,別在樹下躲雨
07日星期二 高溫 31.0℃ 低溫 25.0℃ 19:22 帶好雨具,別在樹下躲雨
08日星期三 高溫 30.0℃ 低溫 24.0℃ 19:21 帶好雨具,別在樹下躲雨</code></pre>
<pre><code class="language-python">%cd D:\全棧\json api
d = ret.json()
import json
with open('weather.json', 'w') as f:
    json.dump(d, f)</code></pre>
<pre><code>D:\全棧\json api</code></pre></div>
                </div>
                                                    <div class="artical-copyright mt26">©著作權歸作者所有:來自51CTO博客作者cooperfang的原創作品,如需轉載,請註明出處,否則將追究法律責任</div>
                                    <div class="for-tag mt26">
                                                                                        <a href="https://blog.51cto.com/search/result?q=requests" target="_blank">requests</a>
                                                                                                <a href="https://blog.51cto.com/search/result?q=json" target="_blank">json</a>
                                                                                            <div class="clear"></div>
            </div>
            <div class="more-list">
                <p class="is-praise fl "><span type="1" blog_id="2154806" userid='13108411'>0</span></p>
                <div class="share-box fr">
                    <p class="share"><i></i>分享</p>
                    <div class="bdsharebuttonbox">
                      <span></span>
                      <a class="bds_tsina" data-cmd="tsina" >微博</a>
                      <a class="bds_sqq" data-cmd="sqq" >QQ</a>
                      <a class="bds_weixin" data-cmd="weixin" >微信</a>
                      <img src="/qr/qr-url?url=http%3A%2F%2Fblog.51cto.com%2F13118411%2F2154806">
                    </div>
                </div>
                <p class="favorites favorites-opt fr"><i></i>收藏</p>
                                <div class="clear"></div>
            </div>
                            <div class="artical-list">
                                    <a class="fl" href="https://blog.51cto.com/13118411/2154797" title="json">
                        上一篇:json</a>
                                                    <div class="clear"></div>
                </div>
                        <div class="author-module">
                <div class="is-vip-bg-6 fl">
                    <a href="https://blog.51cto.com/13118411" class="a-img" target="_blank">
                        <img class="is-vip-img is-vip-img-4" data-uid="13108411" src="http://ucenter.51cto.com/images/noavatar_middle.gif">
                    </a>
                </div>
                <div class="author-module-center fl">
                    <a class="h2" href="https://blog.51cto.com/13118411" target="_blank">cooperfang</a>
                    <h3>42篇文章,1W+人氣,0粉絲</h3>
                                    </div>
                                <div class="clear"></div>
            </div>
        </div>
        <div class="artical-Left" id="comment">
            <!-- 發佈評論 -->
            <div class="comment-creat">
                <div class="is-vip-bg-6 fl">
                    <a href="https://blog.51cto.com/13118411" class="header-img" target="_blank">
                        <img  src="http://ucenter.51cto.com/images/noavatar_middle.gif"/>
                    </a>
                </div>
                <div class="first-publish fr publish_user_id">
                    <textarea class="textareadiv textareadiv-publish" name="" id="" placeholder="提問和評論都可以,用心的回覆會被更多人看到和認可"  maxlength="500"></textarea>
                    <div class="comment-push">
                        <p class="msg fl">Ctrl+Enter&nbsp;發佈</p>
                                                    <p class="publish-btn blue-btn fr" flag="1">發佈</p>
                                                <p class="cancel-btn cancel-btn-1 fr">取消</p>
                        <div class="clear"></div>
                    </div>
                    <input type="hidden" class="user_id" value="13108411">
                    <input type="hidden" class="reply_id" value="2154806">
                    <input type="hidden" class="first_pid" value="">
                </div>
                <div class="clear"></div>
            </div>
                        <div class="commentList">
                        </div>
            <!-- page -->
            <div class="act_pageList_box"></div>
        </div>
        <!-- end left -->
        <!-- right start -->
        <div class="Blog-Right artical-Right">
            <a class="catalog"></a>
            <a class="scrollTop" href="javascript:void(0);" onclick="$(window).scrollTop(0);"></a>
        </div>
        <!-- end right  -->
    </div>
            <div class="special-column">
            <div class="Page M764">
                                    <div class="column-1">
                        <h2 class="column-tit">推薦專欄</h2>
                                                    <div class="column-box">
                                <a href="https://blog.51cto.com/cloumn/detail/13" class="a-img fl cloumn-tab-par" target="_blank">
                                    <img src="https://s1.51cto.com/images/blog/201808/03/a940c66317ecbe58436a2ad3831c2d7d.png">
                                                                            <span class="cloumn-tab-new cloumn-tab-new-1 cloumn-tab2 f12">上新</span>
                                                                    </a>
                                <div class="center fl">
                                    <a class="h2 white-space" href="https://blog.51cto.com/cloumn/detail/13" target="_blank">基於Python的DevOps實戰</a>
                                    <h3 class="white-space">運維開發全攻略</h3>
                                    <h4 class="white-space">共18章&nbsp;|&nbsp;<a href="https://blog.51cto.com/yuhongchun" target="_blank">撫琴煮酒</a></h4>
                                    <h5><span class="price">¥51.00</span><span>6人訂閱</span></h5>
                                </div>
                                <div class="right fr">
                                                                              <a class="cloumn-subscribe" cid="13" href="/cloumn/detail/13" ask='1' target="_blank">訂閱</a>
                                                                    </div>
                                <div class="clear"></div>
                            </div>
                                                    <div class="column-box">
                                <a href="https://blog.51cto.com/cloumn/detail/4" class="a-img fl cloumn-tab-par" target="_blank">
                                    <img src="https://s1.51cto.com/images/blog/201804/27/92f96bca4a81e7c15a63e696e2a8d8e5.jpg">
                                                                    </a>
                                <div class="center fl">
                                    <a class="h2 white-space" href="https://blog.51cto.com/cloumn/detail/4" target="_blank">微服務技術架構和大數據治理實戰</a>
                                    <h3 class="white-space">大數據時代的微服務之路</h3>
                                    <h4 class="white-space">共18章&nbsp;|&nbsp;<a href="https://blog.51cto.com/ityouknow" target="_blank">純潔微笑</a></h4>
                                    <h5><span class="price">¥51.00</span><span>496人訂閱</span></h5>
                                </div>
                                <div class="right fr">
                                                                              <a class="cloumn-subscribe" cid="4" href="/cloumn/detail/4" ask='1' target="_blank">訂閱</a>
                                                                    </div>
                                <div class="clear"></div>
                            </div>
                                            </div>
                                                    <div class="column-2" style="margin-top: 76px;">
                        <h2 class="column-tit">猜你喜歡</h2>
                        <div class="column-box">
                                                            <a class="white-space" href="https://blog.51cto.com/13118411/2154797?source=dra" target="_blank">json</a>
                                                            <a class="white-space" href="https://blog.51cto.com/13118411/2154710?source=dra" target="_blank">v0.35</a>
                                                            <a class="white-space" href="https://blog.51cto.com/laputaliya/536858?source=drt" target="_blank">JQuery ajax返回JSON時的處理方式</a>
                                                            <a class="white-space" href="https://blog.51cto.com/zhaojianping/629526?source=drt" target="_blank">android 讀取json數據(遍歷JSONObject和JSONArray)</a>
                                                            <a class="white-space" href="https://blog.51cto.com/huqilong/136802?source=drt" target="_blank">struts2 json jquery 集成詳解</a>
                                                            <a class="white-space" href="https://blog.51cto.com/12731497/2154195?source=drh" target="_blank">談談Python實戰數據可視化之pyplot模塊</a>
                                                            <a class="white-space" href="https://blog.51cto.com/13719825/2151358?source=drh" target="_blank">用爬蟲和Flask打造屬於自己的電影網站,完整教程送上!</a>
                                                            <a class="white-space" href="https://blog.51cto.com/lavenliu/2150518?source=drh" target="_blank">掌握面向對象編程本質,徹底掌握OOP</a>
                                                        <div class="clear"></div>
                        </div>
                    </div>
                            </div>
        </div>
        <div class="the-lowest-bg">
        <div class="the-lowest Page M764">
            <p class="is-praise fl "><span type="1" blog_id="2154806" userid='13108411'></span></p>
            <p class="b-favorites favorites-opt fl"><i></i><b>0</b></p>
            <a class="b-reply fl"><i></i><font class="comment_number"></font></a>
            <div class="b-share fl">
                <i></i>分享
                <div class="bdsharebuttonbox">
                  <a class="bds_tsina p2" data-cmd="tsina"></a>
                  <a class="bds_sqq p3" data-cmd="sqq"></a>
                  <a class="bds_weixin p1" data-cmd="weixin"><em class="code-icon"></em><img class="code-img" src="/qr/qr-url?url=http%3A%2F%2Fblog.51cto.com%2F13118411%2F2154806"></a>
                </div>
            </div>
                        <a href="https://blog.51cto.com/13118411" class="b-name fr">cooperfang</a>
            <div class="is-vip-bg-6 fr">
                <a href="https://blog.51cto.com/13118411" class="b-img"><img class="is-vip-img is-vip-img-4" data-uid="13108411" src="http://ucenter.51cto.com/images/noavatar_middle.gif"></a>
            </div>
            <div class="clear"></div>
        </div>
    </div>
</div>
<!-- 老博文美觀處理 -->
<script>
    var praise_url = 'https://blog.51cto.com/praise/praise'
        addReply_url = 'https://blog.51cto.com/comments/add'
        removeUrl = 'https://blog.51cto.com/comments/del'
        blog_id = '2154806'
        rid = '0'
        is_comment = '0'
        comment_list = '/blog/ajax-comment-list'
        comment_sort = "asc"
        index_url = 'https://blog.51cto.com/13118411';
        uc_url = 'http://ucenter.51cto.com/'
        blog_url = 'https://blog.51cto.com/'
        img_url = 'https://static1.51cto.com/edu/blog/'
        i_user_id = ''
        c_user_id ='13108411'
        collect_url = 'https://blog.51cto.com/collect/add'
        is_old = '0'
        nicknameurl = 'https://blog.51cto.com/13118411'
        nickname = 'cooperfang'
        myself = window.location.href
    $('.you-like-list li:odd').css({'margin-left': '10%'});
    $('.column-box a:odd').addClass('left-list')
    $('.myUrl').text(myself).click(function(){window.open(myself)})
    setTimeout(function(){$('.Footer').css({'margin-top':'-50px'})},50)
            if(is_old==1){SyntaxHighlighter.all();}
    window._bd_share_config={
    "common":{
      "bdText":"天氣預報定製",
      "bdDesc":$("#abstract_bdshare").text(),
      "bdMini":"2",
      "bdMiniList":false,
      "bdPic":"https://s1.51cto.com/images/201710/25/bd540a4f14d822f6e69087758699358b.jpg",
      "bdStyle":"0",
      "bdSize":"16"
    },
    "share":{}
  };
  with(document)0[(getElementsByTagName('head')[0]||body).appendChild(createElement('script')).src='http://bdimg.share.baidu.com/static/api/js/share.js?v=89860593.js?cdnversion='+~(-new Date()/36e5)];
  setTimeout(function(){
    $('.bdsharebuttonbox a').removeAttr('title')
  },1000)
</script>
</div>
<script src="https://static1.51cto.com/edu/blog/js/marked.min.js?v=1.0.0.5"></script><script src="https://static1.51cto.com/edu/blog/js/highlight.js"></script><script src="https://static1.51cto.com/edu/blog/js/detail_mp.js?v=2.0.1.7"></script><script src="https://static1.51cto.com/edu/blog/js/detail.js?v=1.0.6.9"></script><script src="https://static1.51cto.com/edu/blog/js/details_new.js?v=1.1.1"></script><script src="https://static1.51cto.com/edu/blog/js/copy.js?v=1.0.0.0"></script>    <script src="https://static1.51cto.com/edu/blog/js/pvlog.js"></script>
<script src="https://logs.51cto.com/rizhi/count/count.js"></script>
<script>
  $(".gotop").click(function(){$(window).scrollTop(0)})
</script>

    <script type="text/javascript">
    //百度統計代碼
    var _hmt = _hmt || [];
    (function() {
      var hm = document.createElement("script");
      hm.src = "https://hm.baidu.com/hm.js?2283d46608159c3b39fc9f1178809c21";
      var s = document.getElementsByTagName("script")[0];
      s.parentNode.insertBefore(hm, s);
    })();

    //自動推送鏈接
    (function(){
        var bp = document.createElement('script');
        var curProtocol = window.location.protocol.split(':')[0];
        if (curProtocol === 'https') {
            bp.src = 'https://zz.bdstatic.com/linksubmit/push.js';
        }
        else {
            bp.src = 'http://push.zhanzhang.baidu.com/push.js';
        }
        var s = document.getElementsByTagName("script")[0];
        s.parentNode.insertBefore(bp, s);
    })();
      var _vds = _vds || [];
      window._vds = _vds;
      (function(){
        _vds.push(['setAccountId', '8c51975c40edfb67']);
        (function() {
          var vds = document.createElement('script');
          vds.type='text/javascript';
          vds.async = true;
          vds.src = ('https:' == document.location.protocol ? 'https://' : 'http://') + 'assets.growingio.com/vds.js';
          var s = document.getElementsByTagName('script')[0];
          s.parentNode.insertBefore(vds, s);
        })();
      })();
      document.write(decodeURI("%3Cscript src='https://tongji.51cto.com/frontend/tj.gif' type='text/javascript'%3E%3C/script%3E"));
    </script>

<script>
  var uid = '';
  var BLOG_URL = 'https://blog.51cto.com/';
</script>
<script src="https://static1.51cto.com/edu//blog/js/jquery.cookie.js"></script>
<script src="https://static1.51cto.com/edu/blog/js/time-on-page.js?v=1.0.2" charset="utf-8"></script>
<script>
    (function(){
        var wh=$(window).height(),fh=$('.Footer').outerHeight(true),hh=$('.Header').outerHeight(true)
        $('.Content-box').css({'min-height': wh-fh-hh})
    })()
</script>
</body>
</html>
# Parse the downloaded page text using the 'html.parser' backend.
contents = BeautifulSoup(data.text, 'html.parser')
# Collect every <p> tag and glue their text together into a single string.
all_p = contents.find_all('p')
all_text = ''.join(str(paragraph.text) for paragraph in all_p)
print(all_text)
掃一掃體驗手機閱讀0分享收藏Ctrl+Enter 發佈發佈取消0
# pip install jieba    (splits Chinese text into individual words)
import jieba
text = jieba.cut(all_text)  # segment the collected page text into words
"""
Signature: jieba.cut(sentence, cut_all=False, HMM=True)
Docstring:
The main function that segments an entire sentence that contains
Chinese characters into seperated words.

"""
# Keep every segmented word in a list, echoing each one as it is produced.
text_list = []
for word in text:
    text_list.append(word)
    print(word)
Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\coop\AppData\Local\Temp\jieba.cache
Loading model cost 1.107 seconds.
Prefix dict has been built succesfully.

掃一掃
體驗
手機
閱讀
0
分享
收藏
Ctrl
+
Enter
 
發佈
發佈
取消
0
import collections  # standard-library module that provides Counter
# Tally how many times each segmented word occurs.
count = collections.Counter(text_list)
# most_common(30) yields at most 30 (word, occurrences) pairs, highest first.
top_thirty = count.most_common(30)
for word, occurrences in top_thirty:
    print(word, occurrences)
0 2
發佈 2
掃一掃 1
體驗 1
手機 1
閱讀 1
分享 1
收藏 1
Ctrl 1
+ 1
Enter 1
  1
取消 1
# 做接口  可以給別人這個py文件,也可以是個鏈接
import collections

def get_most_common(text_list, max_num=30):
    """Return the most frequent items of *text_list* in an API-style envelope.

    Args:
        text_list: Iterable of hashable items (e.g. segmented words) to count.
        max_num: Maximum number of (item, count) pairs to report.

    Returns:
        A dict with three keys:
        ``status`` -- 0 on success, 1 when counting failed;
        ``statusText`` -- ``'ok'`` on success, otherwise the error message
        as a string;
        ``data`` -- list of ``(item, count)`` tuples ordered from most to
        least frequent; left as the initial empty dict on failure.
    """
    ret = {'status': 0, 'statusText': 'ok', 'data': {}}  # generic API envelope
    try:
        # list() first so a mapping argument is counted by its keys rather
        # than being interpreted by Counter as ready-made counts.
        items = list(text_list)
        ret['data'] = collections.Counter(items).most_common(max_num)
    except Exception as e:
        # Boundary wrapper: report the failure in the envelope instead of
        # raising. Store the message text, not the exception object, so the
        # envelope stays plain data that can be serialized.
        ret['status'] = 1
        ret['statusText'] = str(e)
    return ret

get_most_common(text_list)
{'status': 0,
 'statusText': 'ok',
 'data': [('0', 2),
  ('發佈', 2),
  ('掃一掃', 1),
  ('體驗', 1),
  ('手機', 1),
  ('閱讀', 1),
  ('分享', 1),
  ('收藏', 1),
  ('Ctrl', 1),
  ('+', 1),
  ('Enter', 1),
  ('\xa0', 1),
  ('取消', 1)]}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章