環境:Python 3.6,開發工具(IDE):PyCharm
在本書 8.1 節中,作者用 2-grams 處理文本,但書中給出的程序並不完整,這裏用自己編寫的程序將其補充完整。
完整代碼如下:
from urllib.request import urlopen
import re
import string
import operator
def cleanInput(input):
    """Normalize raw text into a list of lowercase ASCII word tokens.

    The text is processed in this order: every whitespace run (newlines,
    tabs, carriage returns, spaces) collapses to one space, the text is
    lowercased, bracketed numeric markers such as ``[12]`` are removed,
    non-ASCII characters are discarded, and finally each whitespace-separated
    token has leading/trailing punctuation stripped.

    Args:
        input: The text to clean (``str``).

    Returns:
        A list of tokens in their original order. Tokens of length 0 or 1
        are dropped, except for the one-letter words ``a`` and ``i``.
    """
    # Raw-string patterns: '\[' inside a plain literal is an invalid escape
    # sequence. '\s+' (instead of '\n+') keeps '\r' and '\t' from sticking
    # to the neighbouring word.
    text = re.sub(r'\s+', ' ', input).lower()
    text = re.sub(r'\[[0-9]*\]', '', text)
    # Silently drop every character that has no ASCII representation.
    text = text.encode('ascii', 'ignore').decode('ascii')

    words = []
    # split() with no argument also absorbs the double spaces that the
    # removals above can leave behind.
    for token in text.split():
        token = token.strip(string.punctuation)
        if len(token) > 1 or token in ('a', 'i'):
            words.append(token)
    return words
def ngrams(input,n):
    """Count how often each run of ``n`` consecutive cleaned words occurs.

    The text is first tokenized with ``cleanInput``; every window of ``n``
    adjacent tokens is joined with a single space and used as a key.

    Args:
        input: Raw text to analyse.
        n: Number of words per n-gram.

    Returns:
        A dict mapping each space-joined n-gram to its occurrence count,
        with keys in order of first appearance.
    """
    words = cleanInput(input)
    counts = {}
    last_start = len(words) - n
    for start in range(last_start + 1):
        phrase = " ".join(words[start:start + n])
        counts[phrase] = counts.get(phrase, 0) + 1
    return counts
# Words treated as too common to make an n-gram interesting. Built once at
# import time as a frozenset so each membership test is a hash lookup.
_COMMON_WORDS = frozenset([
    'the', 'be', 'and', 'of', 'a', 'in', 'to', 'have', 'it',
    'i', 'that', 'for', 'you', 'he', 'with', 'on', 'do', 'say', 'this',
    'they', 'is', 'an', 'at', 'but', 'we', 'his', 'from', 'not',
    'by', 'she', 'or', 'as', 'what', 'go', 'their', 'can', 'who', 'get',
    'if', 'would', 'her', 'all', 'my', 'make', 'about', 'know', 'will',
    'up', 'one', 'time', 'has', 'been', 'there', 'year', 'so',
    'think', 'when', 'which', 'them', 'some', 'me', 'people', 'take',
    'out', 'into', 'just', 'see', 'him', 'your', 'come', 'could', 'now',
    'than', 'like', 'other', 'how', 'then', 'its', 'our', 'two', 'more',
    'day', 'use', 'no', 'man', 'find', 'here', 'thing', 'give',
    'many', 'well',
])


def isCommon(ngram):
    """Return True if any word of the n-gram is in the common-word list.

    Args:
        ngram: Iterable of words (for example the result of splitting an
            n-gram string on spaces). The comparison is case-sensitive and
            the list holds lowercase words only.

    Returns:
        ``True`` when at least one word is a common word, ``False``
        otherwise (including for an empty iterable).
    """
    return any(word in _COMMON_WORDS for word in ngram)
def find_match(ngrams,res):
    """Collect, for every n-gram, the sentences that contain all of its words.

    Args:
        ngrams: Iterable of items whose first element is the n-gram text,
            with words separated by single spaces (for example the
            ``(ngram, count)`` pairs produced by sorting a count dict).
        res: Sequence of sentence strings to search. It is scanned once per
            n-gram, so it must be re-iterable (a list, not a generator).

    Returns:
        A dict mapping each n-gram text to the list of sentences from
        ``res`` (in their original order) in which every word of the n-gram
        occurs. An n-gram with no matching sentence maps to an empty list.
    """
    match_para = {}
    for item in ngrams:
        phrase = item[0]
        words = phrase.split(' ')
        # Check every word of the n-gram rather than hard-coding the first
        # two, so 1-grams do not raise IndexError and longer n-grams are
        # matched in full.
        # NOTE: ``in`` is a substring test and the words need not be
        # adjacent in the sentence.
        match_para[phrase] = [
            sentence for sentence in res
            if all(word in sentence for word in words)
        ]
    return match_para
# Download the speech text. The context manager closes the HTTP response
# even if reading or decoding fails.
with urlopen("http://pythonscraping.com/files/inaugurationSpeech.txt") as response:
    content=response.read().decode('utf-8')
# NOTE: this rebinds the module-level name ``ngrams`` from the function to
# the dict it returns, so the function cannot be called again afterwards.
ngrams=ngrams(content,2)
# (2-gram, count) pairs, most frequent first.
sortedNGrams=sorted(ngrams.items(),key=operator.itemgetter(1),reverse=True)
print(sortedNGrams)
print('the number of all 2-grams is:'+str(len(sortedNGrams)))
#---------------------
# Keep only the 2-grams in which no word is a common word.
selected_ngrams=[
    pair for pair in sortedNGrams
    if not isCommon(pair[0].split(' '))
]
print(selected_ngrams)
print('the number of the significant 2-grams is:'+str(len(selected_ngrams)))
#---------------------------------------------
# Split the original speech into sentences that end with a period, question
# mark, exclamation mark or semicolon, then lowercase each one and strip the
# surrounding whitespace.
prog=re.compile(r'[A-Za-z0-9 \':,\-]*[.?!;]{1}')
res=[sentence.lower().strip() for sentence in prog.findall(content)]
for value in res:
    print(value)
# Map every significant 2-gram to the sentences that contain its words.
match_para=find_match(selected_ngrams,res)
print(match_para)