Python(2.7.x)實現簡單的單詞頻數統計

1. word_stat.py

# encoding: utf-8 
import re
import os
from sys import argv

def split_str(text):
	"""Split ``text`` into a list of pieces on runs of non-word characters.

	Surrounding whitespace is stripped before splitting. If the stripped
	text starts or ends with a non-word character, the result contains an
	empty string at that end; an empty input yields ``[""]``.
	"""
	trimmed = text.strip()
	separator = re.compile(r"\W+")
	return separator.split(trimmed)

def sort_dict(d):
	"""Return the (key, value) pairs of ``d`` as a list, largest value first.

	The dictionary itself is not modified.
	"""
	pairs = list(d.items())
	pairs.sort(key=lambda pair: pair[1], reverse=True)
	return pairs

def filter_dict(d, limit = 0):
	"""Remove, in place, every entry of ``d`` whose value is ``<= limit``.

	d     -- dictionary to prune; it is modified directly.
	limit -- entries with a value less than or equal to this are dropped.

	Returns None.
	"""
	# Collect the keys first: deleting from a dict while iterating over its
	# live items() view raises RuntimeError on Python 3.
	doomed = [k for (k, v) in d.items() if v <= limit]
	for k in doomed:
		del d[k]

def filter_word(word, minlen = 3, maxlen = 20):
	"""Return True if ``word`` should be left out of the statistics, else False.

	A word is rejected when its length is outside ``minlen``..``maxlen``
	(inclusive), when it consists only of digits, or when it is a member of
	the module-level ``filterwords`` collection.

	word   -- the candidate word.
	minlen -- shortest accepted length.
	maxlen -- longest accepted length.
	"""
	if not minlen <= len(word) <= maxlen:
		return True
	if word.isdigit():
		return True
	# Always hand back a real bool; accepted words previously fell off the
	# end of the function and produced None.
	return word in filterwords

def stat_words(words, word_dict):
	"""Count the words in ``words`` into ``word_dict`` and return it.

	Each word is lower-cased, then skipped if ``filter_word`` rejects it;
	otherwise its counter in ``word_dict`` is incremented, starting at 1 the
	first time the word is seen. ``word_dict`` is updated in place.
	"""
	for raw in words:
		token = raw.lower()
		if not filter_word(token):
			word_dict[token] = word_dict.get(token, 0) + 1
	return word_dict

def process(text):
	"""Split one piece of text into words and add them to the global counts.

	The words produced by ``split_str`` are tallied into the module-level
	``word_dict`` through ``stat_words``. Returns None.
	"""
	stat_words(split_str(text), word_dict)

def load_filter_words():
	"""Read ``wordsfilter.txt`` from the current directory as a set of words.

	The whole file is read and split with ``split_str``; duplicates collapse
	because the result is a set. Errors from ``open`` are not caught here.
	"""
	with open("wordsfilter.txt") as src:
		return set(split_str(src.read()))

# Words to exclude from the statistics; replaced below by the contents of
# wordsfilter.txt once at least one command-line argument is given.
filterwords = set()
# Word -> occurrence count, filled in by process().
word_dict = {}

if len(argv) > 1:
	try:
		# Load the exclusion list before any text is processed.
		filterwords = load_filter_words()

		# Skip the script name with a slice rather than deleting argv[0],
		# which would modify the shared sys.argv list.
		for fname in argv[1:]:
			# Arguments that are not existing regular files are skipped.
			if os.path.isfile(fname):
				# Read the file line by line and tally each line.
				with open(fname) as f:
					for line in f:
						process(line)

		# Drop words whose count is at or below filter_dict's default limit.
		filter_dict(word_dict)
		# Order the words by number of occurrences, most frequent first.
		sorted_word_dict = sort_dict(word_dict)

		# Print one (word, count) tuple per line. The parenthesised form
		# produces the same output on Python 2 and is valid on Python 3.
		for item in sorted_word_dict:
			print(item)

	except IOError as e:
		# Covers a missing wordsfilter.txt as well as unreadable input files.
		print(str(e))

2. 要過濾的單詞wordsfilter.txt:

#
a
an
the

#
be
am
is
are
been
was
were
do
does
did
done
have
has
had
will
would
can
could
shall
should
may
might
must

#
here
there
this
that
these
those

#
how
what
when
where
which
who
whom

#
i
we
our
ours
you
your
yours
he
she
they
it
his
her
hers
its
their
theirs

#
break
class
const
continue
double
except
exception
false
final
finally
float
for
else
int
integer
if
long
private
protected
public
short
static
switch
true
try
while

#
http
https
java
javabean
javascript
jquery
js
jsf
jsp
mysql
oracle
sql
url
xml
web

#
also
and
but
either
nor
or

#
about
above
across
after
back
before
between
by
from
in
into
of
off
on
out
to
under
up
with
without

#
one
two
three
four
five
six
seven
eight
nine
ten
first
second
third

#
all
better
best
each
even
good
hello
just
once
only
other
many
much
more
most
next
no
not
so
such
too
than
then
very
well
yes

#
come
came
get
give
go
gone
need
set


發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章