python 網絡數據採集第八章

Python 3.6,開發環境(IDE):PyCharm

在本書 8.1 節中,作者用 2-gram 處理文本,但書中的程序並不完整,在這裏用自己編寫的程序將其補充完整。

完整代碼如下:

from urllib.request import urlopen
import re
import string
import operator


def cleanInput(input):
    """Normalize raw text into a list of lowercase ASCII words.

    Newline runs become a single space, the text is lowercased, bracketed
    numeric markers such as ``[12]`` are removed, runs of spaces are
    collapsed, and every non-ASCII character is dropped.  The text is then
    split on single spaces and each token is stripped of leading/trailing
    ASCII punctuation.

    Args:
        input: The text to clean (a ``str``).

    Returns:
        A list of cleaned tokens.  Tokens of length 0 or 1 are discarded,
        except for the one-letter words ``'a'`` and ``'i'``.
    """
    # Raw strings: in a plain literal '\[' is an invalid escape sequence,
    # which newer Python versions warn about.
    text = re.sub(r'\n+', ' ', input).lower()
    text = re.sub(r'\[[0-9]*\]', '', text)
    text = re.sub(r' +', ' ', text)
    # Round-trip through bytes to silently drop anything outside ASCII.
    text = text.encode('utf-8').decode('ascii', 'ignore')

    words = []
    for token in text.split(' '):
        token = token.strip(string.punctuation)
        # The text is already lowercase, so a direct comparison is enough.
        if len(token) > 1 or token in ('a', 'i'):
            words.append(token)
    return words

def ngrams(input,n):
    """Count the n-grams found in a piece of text.

    The text is first tokenized with ``cleanInput``; every window of ``n``
    consecutive tokens is joined with a single space and tallied.

    Args:
        input: The raw text to analyze.
        n: Number of consecutive words per n-gram.

    Returns:
        A dict mapping each space-joined n-gram to how many times it occurs.
    """
    words = cleanInput(input)
    counts = {}
    window_count = len(words) - n + 1
    for start in range(window_count):
        phrase = " ".join(words[start:start + n])
        counts[phrase] = counts.get(phrase, 0) + 1
    return counts


# Words considered too common to make an n-gram interesting.  Kept as a
# module-level frozenset so it is built once rather than on every call, and
# so membership tests are hash lookups instead of a list scan.
_COMMON_WORDS = frozenset([
    'the', 'be', 'and', 'of', 'a', 'in', 'to', 'have', 'it',
    'i', 'that', 'for', 'you', 'he', 'with', 'on', 'do', 'say', 'this',
    'they', 'is', 'an', 'at', 'but', 'we', 'his', 'from', 'not',
    'by', 'she', 'or', 'as', 'what', 'go', 'their', 'can', 'who', 'get',
    'if', 'would', 'her', 'all', 'my', 'make', 'about', 'know', 'will',
    'up', 'one', 'time', 'has', 'been', 'there', 'year', 'so',
    'think', 'when', 'which', 'them', 'some', 'me', 'people', 'take',
    'out', 'into', 'just', 'see', 'him', 'your', 'come', 'could', 'now',
    'than', 'like', 'other', 'how', 'then', 'its', 'our', 'two', 'more',
    'day', 'use', 'no', 'man', 'find', 'here', 'thing', 'give',
    'many', 'well',
])


def isCommon(ngram):
    """Tell whether an n-gram contains at least one common word.

    Args:
        ngram: An iterable of words (strings), e.g. ``['united', 'states']``.

    Returns:
        True if any word is in the common-word set, otherwise False
        (including for an empty ``ngram``).
    """
    return any(word in _COMMON_WORDS for word in ngram)


def find_match(ngrams,res):
    """Collect, for each n-gram, the sentences that contain all of its words.

    Args:
        ngrams: An iterable of items whose first element is the n-gram text
            with words separated by single spaces, e.g. ``('united states', 3)``.
        res: An iterable of sentences (strings) to search.

    Returns:
        A dict mapping each n-gram text to the list of sentences from ``res``
        in which every word of the n-gram occurs.  Matching uses the ``in``
        operator on the sentence, so a word also counts as present when it
        appears inside a longer word, and the words need not be adjacent.
    """
    match_para = {}
    for item in ngrams:
        phrase = item[0]
        words = phrase.split(' ')
        # Require every word instead of indexing words[0] and words[1]: this
        # also works for one-word n-grams and checks all words of longer ones.
        match_para[phrase] = [
            sentence for sentence in res
            if all(word in sentence for word in words)
        ]
    return match_para


# Download the speech text; the context manager closes the HTTP response
# once the body has been read.
with urlopen("http://pythonscraping.com/files/inaugurationSpeech.txt") as response:
    content=str(response.read(),'utf-8')
# Store the counts under their own name: assigning the result to `ngrams`
# would replace the ngrams() function with a dict.
ngram_counts=ngrams(content,2)
# Most frequent 2-grams first.
sortedNGrams=sorted(ngram_counts.items(),key=operator.itemgetter(1),reverse=True)
print(sortedNGrams)
print('the number of all 2-grams is:'+str(len(sortedNGrams)))
#---------------------
# Drop every 2-gram that contains at least one common word.
selected_ngrams=[item for item in sortedNGrams
                 if not isCommon(item[0].split(' '))]
print(selected_ngrams)
print('the number of the significant 2-grams is:'+str(len(selected_ngrams)))
#---------------------------------------------
# Split the original speech into sentences ending with a period, question
# mark, exclamation mark or semicolon, then lowercase them and strip the
# surrounding whitespace.
# NOTE: the character class below does not include newlines, so a sentence
# that spans several lines only yields the part on its last line.
prog=re.compile(r'[A-Za-z0-9 \':,\-]*[.?!;]{1}')
res=re.findall(prog,content)
res=[x.lower().strip() for x in res]
for value in res:
    print(value)
# Map each significant 2-gram to the sentences containing its words.
match_para=find_match(selected_ngrams,res)
print(match_para)

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章