【发布时间】:2016-02-24 12:19:19
【问题描述】:
我有一个使用 lxml 的 ElementTree 和 XPathEvaluator 解析 XML 文件的脚本。它能正常工作,但需要很长时间才能完成,所以我尝试实现一个多线程版本:
import fnmatch
import operator
import os
import lxml.etree
from nltk import FreqDist
from nltk.corpus import stopwords
from collections import defaultdict
from datetime import datetime
import threading
import Queue
# Combined stopword list: the Dutch entries followed by the English ones.
STOPWORDS = stopwords.words('dutch') + stopwords.words('english')

# Glob pattern (fnmatch syntax) for input file names, and the directory
# tree that is searched for them.
PATTERN = '*.A_PATTERN'
DIR_NAME = 'A_DIRNAME'
def loadData(dir_name, pattern):
    """Walk ``dir_name`` recursively and collect the files matching ``pattern``.

    dir_name: root directory handed to ``os.walk``.
    pattern: ``fnmatch``-style pattern applied to bare file names.

    Returns a 3-tuple ``(files, dir_names, dir_paths)``:
      * files     -- full paths of every matching file, in walk order
      * dir_names -- one list of sub-directory names per visited directory
      * dir_paths -- the path of every visited directory
    """
    matched_paths = []
    subdir_listing = []
    visited_roots = []
    for current_root, child_dirs, child_files in os.walk(dir_name):
        visited_roots.append(current_root)
        subdir_listing.append(child_dirs)
        matched_paths.extend(
            os.path.join(current_root, name)
            for name in fnmatch.filter(child_files, pattern)
        )
    return matched_paths, subdir_listing, visited_roots
def freq(element_list, descending=True):
    """Count how often each element occurs and rank the elements by count.

    element_list: iterable of hashable items.
    descending: when true (the default) the most frequent item comes first,
        otherwise the least frequent one does.

    Returns a list of ``(element, count)`` tuples ordered by count.
    """
    counts = defaultdict(int)
    for item in element_list:
        counts[item] = counts[item] + 1
    ranked = list(counts.items())
    ranked.sort(key=operator.itemgetter(1), reverse=descending)
    return ranked
def lexDiv(amount_words):
    """Return the ratio of distinct items to total items in ``amount_words``.

    amount_words: a sized collection of hashable tokens (``len`` is called
        on it, so a generator is not accepted).

    Returns a float in ``[0.0, 1.0]``. An empty collection yields ``0.0``
    instead of raising ``ZeroDivisionError``.
    """
    total = len(amount_words)
    if total == 0:
        # No tokens at all: the ratio is undefined, report 0.0 rather than crash.
        return 0.0
    return float(len(set(amount_words))) / total
def anotherFreq(list_types, list_words):
    """Print the ten most frequent types and the ten most frequent words.

    list_types: iterable of hashable type labels, counted with nltk's
        ``FreqDist``.
    list_words: iterable of word strings. A word is skipped when its
        lower-cased form is in ``STOPWORDS``; the remaining words are
        counted case-sensitively.

    Returns None; both rankings are written to stdout.
    """
    by_count = operator.itemgetter(1)

    fd = FreqDist(list_types)
    print('top 10 most frequent types:')
    # Sort explicitly instead of relying on the order of fd.items(): only
    # NLTK 2 returned the items by decreasing frequency, in NLTK 3 FreqDist
    # is a Counter and its items come back unordered.
    top_types = sorted(fd.items(), key=by_count, reverse=True)[:10]
    for type_label, count in top_types:
        print('%s %s' % (type_label, count))

    print('\ntop 10 most frequent words:')
    # Build the lookup set once; STOPWORDS is a list, so testing against it
    # directly would rescan the whole list for every word.
    stopword_set = set(STOPWORDS)
    word_counts = defaultdict(int)
    for w in list_words:
        if w.lower() not in stopword_set:
            word_counts[w] += 1
    sorted_dict = sorted(word_counts.items(), key=by_count, reverse=True)
    print(sorted_dict[:10])
def extractor(f):
    """Parse one XML file and derive entity, token and sentence statistics.

    f: the file to parse, passed unchanged to
        ``lxml.etree.ElementTree(file=...)``.

    Returns None. A file that is not well-formed XML is reported on stdout
    and skipped. Documents without any ``text/wf`` tokens no longer raise
    ``IndexError``; the affected values are set to None instead.

    NOTE(review): every statistic below is only bound to a local name;
    nothing is returned or stored by this function as it stands.
    """
    print("check file: {}".format(f))
    try:
        doc = lxml.etree.ElementTree(file=f)
    except lxml.etree.XMLSyntaxError as e:
        print(e)
        return

    # EXSLT regular-expression namespace needed by the re:match() predicates.
    regex_ns = {"re": "http://exslt.org/regular-expressions"}
    doc_evaluator = lxml.etree.XPathEvaluator(doc)

    # External references attached to entities, overall and per entity type.
    entities = doc_evaluator('//entity/*/externalRef/@reference')
    places_dbpedia = doc_evaluator('//entity[contains(@type, "Schema:Place")]/*/externalRef/@reference')
    non_people_dbpedia = set(doc_evaluator('//entity[not(contains(@type, "Schema:Person"))]'))
    people = doc_evaluator('//entity[contains(@type, "Schema:Person")]/*/externalRef/@reference')

    # Tokens split into those matching [A-Za-z-] and those matching its negation.
    words = doc.xpath('text/wf[re:match(text(), "[A-Za-z-]")]/text()',
                      namespaces=regex_ns)
    unique_words = set(words)
    other_tokens = doc.xpath('text/wf[re:match(text(), "[^A-Za-z-]")]/text()',
                             namespaces=regex_ns)

    # The ``sent`` attribute of the last token; presumably sentence ids are
    # increasing so this equals the sentence count -- TODO confirm.
    sentence_ids = doc_evaluator('text/wf/@sent')
    amount_of_sentences = sentence_ids[-1] if sentence_ids else None

    types = doc_evaluator('//term/@morphofeat')

    # (sentence id, number of matching tokens) for the sentence that holds
    # the most tokens matching [A-Za-z-].
    sentence_lengths = freq(doc.xpath('text/wf[re:match(text(), "[A-Za-z-]")]/@sent',
                                      namespaces=regex_ns))
    longest_sentence = sentence_lengths[0] if sentence_lengths else None

    # Rank references by their last path segment.
    top_people = freq([e.split('/')[-1] for e in people])[:10]
    top_entities = freq([e.split('/')[-1] for e in entities])[:10]
    top_places = freq([e.split('/')[-1] for e in places_dbpedia])[:10]
def worker():
    """Process jobs from the module-level queue ``q`` in an endless loop.

    Each job taken from ``q`` is passed to ``extractor``. The loop never
    returns, so the function is meant to run in a daemon thread.

    Every job is acknowledged with ``q.task_done()`` even when
    ``extractor`` raises; otherwise a single failing file would leave the
    queue's unfinished-task counter above zero and ``q.join()`` would block
    forever.
    """
    import traceback

    while True:
        job_number = q.get()
        try:
            extractor(job_number)
        except Exception:
            # Report the failure and keep this thread alive for the next job.
            traceback.print_exc()
        finally:
            q.task_done()
if __name__ == '__main__':
    files, dirs, path = loadData(DIR_NAME, PATTERN)

    # Timing starts after file discovery, so only the queued processing is
    # measured. (An earlier assignment before loadData() was overwritten
    # here without ever being read, and has been dropped.)
    startTime = datetime.now()

    # Job queue; worker() reads it through the module namespace as ``q``.
    q = Queue.Queue()
    for f in files:
        q.put(f)

    # Start 20 daemon threads so the process can exit once the queue is
    # drained, even though worker() itself never returns.
    for i in range(20):
        worker_thread = threading.Thread(target=worker)
        worker_thread.daemon = True
        worker_thread.start()

    # Block until every queued file has been acknowledged with task_done().
    q.join()
    print(datetime.now() - startTime)
这段代码可以运行,但计时后发现它并不比单线程版本快。我认为这与打开和读取文件有关,导致各线程实际上并没有并行执行。如果把解析 XML 文件的函数换成一个只是休眠几秒钟并打印一些内容的函数,多线程确实有效,而且速度快得多。要实现多线程的 XML 解析器,需要注意什么?
【问题讨论】:
-
使用 threading 只会让你编写的代码并发执行,实际上并不会让它跨 CPU 的多个核心运行(如果我说错了,请纠正我)。而且从单个磁盘读取也会成为瓶颈,因为磁盘本身在给定时间内只能处理有限的 I/O。并行化(?!?)代码最多只能为你节省几秒钟甚至几分钟的时间。你需要的是更快的存储,可能还需要某种缓存机制来加快读取速度。如果可能,请先把文件读入 RAM 或数据库作为缓存,然后再使用它们。或者给你的磁盘做 RAID。 -
@Torxed 我的目标不是让它跨越 CPU 的多个核心。并行化才是我的目标,我想说明的是,运行一个执行某种解析的函数会使整个程序阻塞。假设函数 extractor 只是休眠一秒钟并打印一些内容,那么多线程可以正常工作;但在解析 XML 文件时,它就无法并行执行。你知道这是否与 XPathEvaluator 有关,以及是否有解决方法?
标签: python xpath lxml elementtree python-multithreading