# Got a batch of domain names today that need to be resolved to IPs.
# Because the volume is large, multiple processes and a queue are used.
import multiprocessing
import os
import queue
import socket
from multiprocessing import Process, Queue, Pool
# Resolve URLs from the queue and append results to the output file.
def write(q, lock, filename):
    """Drain URLs from *q*, resolve each to an IPv4 address, and append
    "url ip" lines to *filename*.

    Runs concurrently in several worker processes sharing one queue;
    *lock* (a Manager lock) serializes file writes so lines from
    different processes do not interleave.
    """
    while True:
        # get_nowait() + queue.Empty avoids the empty()/get() race the
        # original had: with several workers, empty() could be False and
        # the subsequent get() still block forever on a drained queue.
        try:
            url = q.get_nowait()
        except queue.Empty:
            break
        print(url)
        try:
            ip = socket.gethostbyname(url)
        except OSError:  # socket.gaierror etc. -- resolution failed
            ip = "unknown"
        print(ip)
        lock.acquire()  # lock prevents interleaved writes from multiple processes
        try:
            with open(filename, 'a+') as f:
                f.write(url + " " + ip + "\n")
        finally:
            lock.release()
# Load one chunk file into the shared queue.
def readurl(q, n):
    """Push every URL found in ``<n>.txt`` onto queue *q* and return *q*."""
    with open(str(n) + '.txt', 'r') as src:
        for raw in src:
            q.put(raw.strip())
    return q
# Split the URL list into one chunk file per process.
def multi(urllist, n):
    """Split the URL file *urllist* into *n* chunk files 0.txt .. (n-1).txt.

    Fixes vs. the original:
    - integer division dropped the trailing ``len(lines) % n`` URLs, so
      they were never resolved; the last chunk now absorbs the remainder.
    - chunks are opened in 'w' (not 'a+') mode so a rerun does not append
      duplicate URLs to leftover chunk files.
    """
    with open(urllist, 'r') as f:
        lines = f.readlines()
    size = len(lines) // n
    print(size)
    for m in range(n):
        start = m * size
        # last chunk takes whatever is left, including the remainder
        end = len(lines) if m == n - 1 else start + size
        with open(str(m) + '.txt', 'w') as chunk:
            chunk.writelines(lines[start:end])
# Clean up the per-process chunk files.
def remove(n):
    """Delete the temporary chunk files 0.txt .. (n-1).txt."""
    for idx in range(n):
        os.remove(str(idx) + '.txt')
    print("######清除臨時文件######")
if __name__ == "__main__":
    # Manager-backed queue/lock are picklable and safe to share with Pool workers.
    manager = multiprocessing.Manager()
    url_queue = manager.Queue()
    write_lock = manager.Lock()
    proc_count = 5            # number of resolver processes
    source_list = "url.txt"   # file of URLs to resolve
    result_file = "test.txt"  # file the results are appended to
    # Split the URL list into per-process chunk files, then feed every
    # chunk into the shared queue and let the pool drain it.
    multi(source_list, proc_count)
    pool = Pool(proc_count)
    for idx in range(proc_count):
        pool.apply_async(write, args=(readurl(url_queue, idx), write_lock, result_file))
    pool.close()
    pool.join()
    remove(proc_count)
    print("#######全部文件採集完成########")