下面這個示例的功能包含:
(1)實現 Python 多進程;
(2)進程間共享變量 company_queue(通過 Manager().Queue() 實現);
(3)每個進程最後輸出一個獨立的結果文件(保存路徑通過參數傳入);
(4)將 JSON 數據整合成一個 DataFrame 並輸出到文件(支持 csv 和 excel 兩種格式);
(5)部分函數的具體實現略去。
示例代碼:
#coding=utf-8
import json
import xlsxwriter
import pandas as pd
from multiprocessing import Pool, Process, Manager, Queue
def get_all_data(companyName):
    """Fetch the data associated with one company name.

    The retrieval logic is omitted in this example, so the function
    currently returns an empty list for every input.

    Args:
        companyName: Name of the company to look up.

    Returns:
        The collected results. This is an empty list while the retrieval
        logic is a placeholder, and also whatever had been collected when
        the retrieval code raised.
    """
    # Bind the result before the try block so the final return can never
    # hit an unbound name when the retrieval code raises early.
    all_result = []
    try:
        # Placeholder: the real retrieval code goes here and fills all_result.
        pass
    except Exception as e:
        # Best-effort: report the failure and fall through to the return.
        print(str(e))
    return all_result
def get_all_company(file_path):
    """Load the list of all companies.

    The loading logic is omitted in this example: ``file_path`` is not read
    yet and an empty list is always returned.

    Args:
        file_path: Path of the file that lists the companies.

    Returns:
        A list of companies (currently always empty).
    """
    company_list = []
    return company_list
def get_all_dataframe(all_result, temp_dic):
    """Append one record to the accumulated results as a single row.

    Every value of ``temp_dic`` is converted with ``str`` and the record is
    turned into a one-row DataFrame. Nothing is written to disk here.

    Args:
        all_result: Rows accumulated so far. A DataFrame, or any empty sized
            object (for example ``[]``) when no row has been added yet.
        temp_dic: Mapping of column name to value for the new row. It is
            left unmodified.

    Returns:
        A DataFrame holding the previous rows followed by the new row.
    """
    # Build the stringified row from a fresh dict so the caller's mapping
    # is not rewritten as a side effect.
    row = pd.DataFrame(
        {key: str(value) for key, value in temp_dic.items()},
        index=[0],
    )
    # len() rather than truthiness: bool(DataFrame) raises ValueError.
    if len(all_result) == 0:
        return row
    # DataFrame.append was removed in pandas 2.0; pd.concat is its
    # replacement. Index labels are kept as-is, exactly like append did.
    return pd.concat([all_result, row])
def Run(company_queue, output_path, mode):
    """Drain the shared queue and write this worker's results to one file.

    Company names are taken from ``company_queue`` until it is empty, the
    data returned by ``get_all_data`` for each of them is collected, and the
    combined table is written to ``output_path`` plus a format suffix.

    Args:
        company_queue: Queue of company names shared between the workers.
        output_path: Output file path without extension.
        mode: ``0`` writes ``<output_path>.csv``; any other value writes
            ``<output_path>.xlsx`` through the xlsxwriter engine.
    """
    import os
    from queue import Empty

    frames = []
    while True:
        # A separate empty() check followed by a blocking get() is racy:
        # another worker can take the last item in between and this worker
        # would then block forever. get_nowait() makes the step atomic.
        try:
            companyName = company_queue.get_nowait()
        except Empty:
            break
        company_data = get_all_data(companyName)
        if company_data is None or len(company_data) == 0:
            continue
        # NOTE(review): the real return type of get_all_data is not shown.
        # A DataFrame is used as-is; anything else non-empty (e.g. a list of
        # records) is handed to the DataFrame constructor - confirm.
        if not isinstance(company_data, pd.DataFrame):
            company_data = pd.DataFrame(company_data)
        # Collect per-company results instead of overwriting them, so the
        # output covers every company this worker processed.
        frames.append(company_data)
    all_result = pd.concat(frames) if frames else pd.DataFrame()

    file_path = output_path + ('.csv' if mode == 0 else '.xlsx')
    # Create the target directory up front; exist_ok tolerates several
    # workers creating it at the same time.
    out_dir = os.path.dirname(file_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    if mode == 0:
        all_result.to_csv(file_path, index=False)
    else:
        # ExcelWriter.save() was removed in pandas 2.0; the context manager
        # writes and closes the workbook on exit.
        with pd.ExcelWriter(file_path, engine='xlsxwriter') as writer:
            all_result.to_excel(writer, sheet_name='Sheet1', index=False)
if __name__ == '__main__':
    # Script entry point: load the company names, put them on a shared
    # queue, and let a pool of workers drain it, one output file per worker.
    # Alternative input: './data/all_test_company.txt'
    file_path = './company_file.txt'
    output_path = './result/result_'
    company_list = get_all_company(file_path)
    max_process = 10

    # A manager-backed queue is a picklable proxy, so it can be passed to
    # pool workers as an ordinary argument.
    company_queue = Manager().Queue()
    for company_name in company_list:
        company_queue.put(company_name)

    pool = Pool(processes=max_process)
    # Keep the AsyncResult handles: an exception raised inside a worker is
    # only visible through them and is silently lost otherwise.
    pending = [
        pool.apply_async(Run, args=(company_queue, output_path + str(i), 1))
        for i in range(max_process)
    ]
    # close() must come before join(): after close() no new task can be
    # submitted, and join() then waits for all workers to finish.
    pool.close()
    pool.join()

    # Report worker failures without aborting the remaining checks.
    for i, result in enumerate(pending):
        try:
            result.get()
        except Exception as e:
            print('worker %d failed: %s' % (i, e))
    print('finished!')