【发布时间】:2016-06-29 20:30:47
【问题描述】:
问题来了:
我正在编写一个 python 程序,其目的是不断地从 RSS 提要中收集新闻。我希望程序收集 1 周的数据。问题是该程序永远不会到周末。有时它在运行几天后冻结,有时几个小时甚至几分钟。它总是冻结,没有错误。当我说冻结时,我的意思是解释器似乎仍在运行,因为我不能给它任何额外的命令。我怎么解决这个问题?
我将在下面发布代码。谢谢大家!!
from goose import Goose
from requests import get
import urllib2
import feedparser
from urllib2 import urlopen
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import datetime as dt
import time
import os
Symbols=['AAPL','T','BA','XOM','GOOG','JPM','PG','WMT']
url='http://finance.yahoo.com/rss/headline?s='
for t in xrange(7):
AAPL=pd.DataFrame()
AAPL['Published']=""
AAPL['Title']=""
AAPL['link']=""
AAPL['ID']=""
AAPL['News']=""
T=pd.DataFrame()
T['Published']=""
T['Title']=""
T['link']=""
T['ID']=""
T['News']=""
BA=pd.DataFrame()
BA['Published']=""
BA['Title']=""
BA['link']=""
BA['ID']=""
BA['News']=""
XOM=pd.DataFrame()
XOM['Published']=""
XOM['Title']=""
XOM['link']=""
XOM['ID']=""
XOM['News']=""
GOOG=pd.DataFrame()
GOOG['Published']=""
GOOG['Title']=""
GOOG['link']=""
GOOG['ID']=""
GOOG['News']=""
JPM=pd.DataFrame()
JPM['Published']=""
JPM['Title']=""
JPM['link']=""
JPM['ID']=""
JPM['News']=""
PG=pd.DataFrame()
PG['Published']=""
PG['Title']=""
PG['link']=""
PG['ID']=""
PG['News']=""
WMT=pd.DataFrame()
WMT['Published']=""
WMT['Title']=""
WMT['link']=""
WMT['ID']=""
WMT['News']=""
DaysIDsAAPL=[]
DaysIDsT=[]
DaysIDsBA=[]
DaysIDsXOM=[]
DaysIDsGOOG=[]
DaysIDsJPM=[]
DaysIDsPG=[]
DaysIDsWMT=[]
count=0
AAPLCount=0
TCount=0
BACount=0
XOMCount=0
GOOGCount=0
JPMCount=0
PGCount=0
WMTCount=0
date=dt.date.today()
newpathAAPL = r'D:\News Data\AAPL\\'+str(t)
newpathT = r'D:\News Data\T\\'+str(t)
newpathBA = r'D:\News Data\BA\\'+str(t)
newpathXOM = r'D:\News Data\XOM\\'+str(t)
newpathGOOG = r'D:\News Data\GOOG\\'+str(t)
newpathJPM = r'D:\News Data\JPM\\'+str(t)
newpathPG = r'D:\News Data\PG\\'+str(t)
newpathWMT = r'D:\News Data\WMT\\'+str(t)
os.makedirs(newpathAAPL)
os.makedirs(newpathT)
os.makedirs(newpathBA)
os.makedirs(newpathXOM)
os.makedirs(newpathGOOG)
os.makedirs(newpathJPM)
os.makedirs(newpathPG)
os.makedirs(newpathWMT)
while dt.date.today()==date:
print "Loop"
try:
#AAPL inner most loop
d1=feedparser.parse(url+Symbols[0])
for x in xrange(len(d1['entries'])):
if int(d1.entries[x]['id'][14:]) not in DaysIDsAAPL:
DaysIDsAAPL.append(int(d1.entries[x]['id'][14:]))
y = len(AAPL.index.tolist())
m=re.search(r'\*(.*)',d1.entries[x]['link'])
z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
if type(z) is not None:
m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
AAPL.loc[y,'Title'] =d1.entries[x]['title'].encode('utf8')
AAPL.loc[y,'link'] =m.encode('utf8')
AAPL.loc[y,'Published'] =d1.entries[x]['published'].encode('utf8')
AAPL.loc[y,'ID'] =int(d1.entries[x]['id'][14:])
hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
page = get(m,headers=hdr)
extractor = Goose()
article = extractor.extract(raw_html=page.text)
text = article.cleaned_text.encode('utf8')
if text == '':
try:
url2 = m
req = urllib2.Request(url2, None, hdr)
html2 = urlopen(req).read().decode('utf8')
raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
Text_file = open(newpathAAPL+r"\\"+str(AAPLCount)+".txt", "w")
Text_file.write(raw)
Text_file.close()
AAPL.loc[y,'News'] = AAPLCount
AAPLCount+=1
AAPL=AAPL.fillna("")
AAPL.to_csv(newpathAAPL+r'\Key.csv')
except:
print m
print "AAPL"
else:
Text_file = open(newpathAAPL+r"\\"+str(AAPLCount)+".txt", "w")
Text_file.write(text)
Text_file.close()
AAPL.loc[y,'News'] =AAPLCount
AAPLCount+=1
AAPL=AAPL.fillna("")
AAPL.to_csv(newpathAAPL+r'\Key.csv')
print "AAPL"
#T inner most loop
d2=feedparser.parse(url+Symbols[1])
for x in xrange(len(d2['entries'])):
if int(d2.entries[x]['id'][14:]) not in DaysIDsT:
DaysIDsT.append(int(d2.entries[x]['id'][14:]))
y = len(T.index.tolist())
m=re.search(r'\*(.*)',d2.entries[x]['link'])
z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
if type(z) is not None:
m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
T.loc[y,'Title'] =d2.entries[x]['title'].encode('utf8')
T.loc[y,'link'] =m.encode('utf8')
T.loc[y,'Published'] =d2.entries[x]['published'].encode('utf8')
T.loc[y,'ID'] =int(d2.entries[x]['id'][14:])
hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
page = get(m,headers=hdr)
extractor = Goose()
article = extractor.extract(raw_html=page.text)
text = article.cleaned_text.encode('utf8')
if text == '':
try:
url2 = m
req = urllib2.Request(url2, None, hdr)
html2 = urlopen(req).read().decode('utf8')
raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
Text_file = open(newpathT+r"\\"+str(TCount)+".txt", "w")
Text_file.write(raw)
Text_file.close()
T.loc[y,'News'] = TCount
TCount+=1
T=T.fillna("")
T.to_csv(newpathT+r'\Key.csv')
except:
print m
print "T"
else:
Text_file = open(newpathT+r"\\"+str(TCount)+".txt", "w")
Text_file.write(text)
Text_file.close()
T.loc[y,'News'] =TCount
TCount+=1
T=T.fillna("")
T.to_csv(newpathT+r'\Key.csv')
print "T"
#BA inner most loop
d3=feedparser.parse(url+Symbols[2])
for x in xrange(len(d3['entries'])):
if int(d3.entries[x]['id'][14:]) not in DaysIDsBA:
DaysIDsBA.append(int(d3.entries[x]['id'][14:]))
y = len(BA.index.tolist())
m=re.search(r'\*(.*)',d3.entries[x]['link'])
z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
if type(z) is not None:
m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
BA.loc[y,'Title'] =d3.entries[x]['title'].encode('utf8')
BA.loc[y,'link'] =m.encode('utf8')
BA.loc[y,'Published'] =d3.entries[x]['published'].encode('utf8')
BA.loc[y,'ID'] =int(d3.entries[x]['id'][14:])
hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
page = get(m,headers=hdr)
extractor = Goose()
article = extractor.extract(raw_html=page.text)
text = article.cleaned_text.encode('utf8')
if text == '':
try:
url2 = m
req = urllib2.Request(url2, None, hdr)
html2 = urlopen(req).read().decode('utf8')
raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
Text_file = open(newpathBA+r"\\"+str(BACount)+".txt", "w")
Text_file.write(raw)
Text_file.close()
BA.loc[y,'News'] = BACount
BACount+=1
BA=BA.fillna("")
BA.to_csv(newpathBA+r'\Key.csv')
except:
print m
print "BA"
else:
Text_file = open(newpathBA+r"\\"+str(BACount)+".txt", "w")
Text_file.write(text)
Text_file.close()
BA.loc[y,'News'] =BACount
BACount+=1
BA=BA.fillna("")
BA.to_csv(newpathBA+r'\Key.csv')
print "BA"
#XOM inner most loop
d4=feedparser.parse(url+Symbols[3])
for x in xrange(len(d4['entries'])):
if int(d4.entries[x]['id'][14:]) not in DaysIDsXOM:
DaysIDsXOM.append(int(d4.entries[x]['id'][14:]))
y = len(XOM.index.tolist())
m=re.search(r'\*(.*)',d4.entries[x]['link'])
z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
if type(z) is not None:
m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
XOM.loc[y,'Title'] =d4.entries[x]['title'].encode('utf8')
XOM.loc[y,'link'] =m.encode('utf8')
XOM.loc[y,'Published'] =d4.entries[x]['published'].encode('utf8')
XOM.loc[y,'ID'] =int(d4.entries[x]['id'][14:])
hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
page = get(m,headers=hdr)
extractor = Goose()
article = extractor.extract(raw_html=page.text)
text = article.cleaned_text.encode('utf8')
if text == '':
try:
url2 = m
req = urllib2.Request(url2, None, hdr)
html2 = urlopen(req).read().decode('utf8')
raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
Text_file = open(newpathXOM+r"\\"+str(XOMCount)+".txt", "w")
Text_file.write(raw)
Text_file.close()
XOM.loc[y,'News'] = XOMCount
XOMCount+=1
XOM=XOM.fillna("")
XOM.to_csv(newpathXOM+r'\Key.csv')
except:
print m
print "XOM"
else:
Text_file = open(newpathXOM+r"\\"+str(XOMCount)+".txt", "w")
Text_file.write(text)
Text_file.close()
XOM.loc[y,'News'] =XOMCount
XOMCount+=1
XOM=XOM.fillna("")
XOM.to_csv(newpathXOM+r'\Key.csv')
#GOOG inner most loop
d5=feedparser.parse(url+Symbols[4])
for x in xrange(len(d5['entries'])):
if int(d5.entries[x]['id'][14:]) not in DaysIDsGOOG:
DaysIDsGOOG.append(int(d5.entries[x]['id'][14:]))
y = len(GOOG.index.tolist())
m=re.search(r'\*(.*)',d5.entries[x]['link'])
z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
if type(z) is not None:
m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
GOOG.loc[y,'Title'] =d5.entries[x]['title'].encode('utf8')
GOOG.loc[y,'link'] =m.encode('utf8')
GOOG.loc[y,'Published'] =d5.entries[x]['published'].encode('utf8')
GOOG.loc[y,'ID'] =int(d5.entries[x]['id'][14:])
hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
page = get(m,headers=hdr)
extractor = Goose()
article = extractor.extract(raw_html=page.text)
text = article.cleaned_text.encode('utf8')
if text == '':
try:
url2 = m
req = urllib2.Request(url2, None, hdr)
html2 = urlopen(req).read().decode('utf8')
raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
Text_file = open(newpathGOOG+r"\\"+str(GOOGCount)+".txt", "w")
Text_file.write(raw)
Text_file.close()
GOOG.loc[y,'News'] = GOOGCount
GOOGCount+=1
GOOG=GOOG.fillna("")
GOOG.to_csv(newpathGOOG+r'\Key.csv')
except:
print m
print "GOOG"
else:
Text_file = open(newpathGOOG+r"\\"+str(GOOGCount)+".txt", "w")
Text_file.write(text)
Text_file.close()
GOOG.loc[y,'News'] =GOOGCount
GOOGCount+=1
GOOG=GOOG.fillna("")
GOOG.to_csv(newpathGOOG+r'\Key.csv')
print "GOOG"
#JPM inner most loop
d6=feedparser.parse(url+Symbols[5])
for x in xrange(len(d6['entries'])):
if int(d6.entries[x]['id'][14:]) not in DaysIDsJPM:
DaysIDsJPM.append(int(d6.entries[x]['id'][14:]))
y = len(JPM.index.tolist())
m=re.search(r'\*(.*)',d6.entries[x]['link'])
z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
if type(z) is not None:
m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
JPM.loc[y,'Title'] =d6.entries[x]['title'].encode('utf8')
JPM.loc[y,'link'] =m.encode('utf8')
JPM.loc[y,'Published'] =d6.entries[x]['published'].encode('utf8')
JPM.loc[y,'ID'] =int(d6.entries[x]['id'][14:])
hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
page = get(m,headers=hdr)
extractor = Goose()
article = extractor.extract(raw_html=page.text)
text = article.cleaned_text.encode('utf8')
if text == '':
try:
url2 = m
req = urllib2.Request(url2, None, hdr)
html2 = urlopen(req).read().decode('utf8')
raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
Text_file = open(newpathJPM+r"\\"+str(JPMCount)+".txt", "w")
Text_file.write(raw)
Text_file.close()
JPM.loc[y,'News'] = JPMCount
JPMCount+=1
JPM=JPM.fillna("")
JPM.to_csv(newpathJPM+r'\Key.csv')
except:
print m
print "JPM"
else:
Text_file = open(newpathJPM+r"\\"+str(JPMCount)+".txt", "w")
Text_file.write(text)
Text_file.close()
JPM.loc[y,'News'] =JPMCount
JPMCount+=1
JPM=JPM.fillna("")
JPM.to_csv(newpathJPM+r'\Key.csv')
print "JPM"
#PG inner most loop
d7=feedparser.parse(url+Symbols[6])
for x in xrange(len(d7['entries'])):
if int(d7.entries[x]['id'][14:]) not in DaysIDsPG:
DaysIDsPG.append(int(d7.entries[x]['id'][14:]))
y = len(PG.index.tolist())
m=re.search(r'\*(.*)',d7.entries[x]['link'])
z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
if type(z) is not None:
m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
PG.loc[y,'Title'] =d7.entries[x]['title'].encode('utf8')
PG.loc[y,'link'] =m.encode('utf8')
PG.loc[y,'Published'] =d7.entries[x]['published'].encode('utf8')
PG.loc[y,'ID'] =int(d7.entries[x]['id'][14:])
hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
page = get(m,headers=hdr)
extractor = Goose()
article = extractor.extract(raw_html=page.text)
text = article.cleaned_text.encode('utf8')
if text == "":
try:
url2 = m
req = urllib2.Request(url2, None, hdr)
html2 = urlopen(req).read().decode('utf8')
raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
Text_file = open(newpathPG+r"\\"+str(PGCount)+".txt", "w")
Text_file.write(raw)
Text_file.close()
PG.loc[y,'News'] = PGCount
PGCount+=1
PG=PG.fillna("")
PG.to_csv(newpathPG+r'\Key.csv')
except:
print m
print "PG"
else:
Text_file = open(newpathPG+r"\\"+str(PGCount)+".txt", "w")
Text_file.write(text)
Text_file.close()
PG.loc[y,'News'] =PGCount
PGCount+=1
PG=PG.fillna("")
PG.to_csv(newpathPG+r'\Key.csv')
print "PG"
#WMT inner most loop
d8=feedparser.parse(url+Symbols[7])
for x in xrange(len(d8['entries'])):
if int(d8.entries[x]['id'][14:]) not in DaysIDsWMT:
DaysIDsWMT.append(int(d8.entries[x]['id'][14:]))
y = len(WMT.index.tolist())
m=re.search(r'\*(.*)',d8.entries[x]['link'])
z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
if type(z) is not None:
m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
WMT.loc[y,'Title'] =d8.entries[x]['title'].encode('utf8')
WMT.loc[y,'link'] =m.encode('utf8')
WMT.loc[y,'Published'] =d8.entries[x]['published'].encode('utf8')
WMT.loc[y,'ID'] =int(d8.entries[x]['id'][14:])
hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
page = get(m,headers=hdr)
extractor = Goose()
article = extractor.extract(raw_html=page.text)
text = article.cleaned_text.encode('utf8')
if text == "":
try:
url2 = m
req = urllib2.Request(url2, None, hdr)
html2 = urlopen(req).read().decode('utf8')
raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
Text_file = open(newpathWMT+r"\\"+str(WMTCount)+".txt", "w")
Text_file.write(raw)
Text_file.close()
WMT.loc[y,'News'] = WMTCount
WMTCount+=1
WMT=WMT.fillna("")
WMT.to_csv(newpathWMT+r'\Key.csv')
except:
print m
print "WMT"
else:
Text_file = open(newpathWMT+r"\\"+str(WMTCount)+".txt", "w")
Text_file.write(text)
Text_file.close()
WMT.loc[y,'News'] =WMTCount
WMTCount+=1
WMT=WMT.fillna("")
WMT.to_csv(newpathWMT+r'\Key.csv')
print "WMT"
count+=1
print count
time.sleep(1)
except:
print "Error"
AAPL=AAPL.fillna("")
AAPL.to_csv(newpathAAPL+r'\Key.csv')
T=T.fillna("")
T.to_csv(newpathT+r'\Key.csv')
BA=BA.fillna("")
BA.to_csv(newpathBA+r'\Key.csv')
XOM=XOM.fillna("")
XOM.to_csv(newpathXOM+r'\Key.csv')
GOOG=GOOG.fillna("")
GOOG.to_csv(newpathGOOG+r'\Key.csv')
JPM=JPM.fillna("")
JPM.to_csv(newpathJPM+r'\Key.csv')
PG=PG.fillna("")
PG.to_csv(newpathPG+r'\Key.csv')
WMT=WMT.fillna("")
WMT.to_csv(newpathWMT+r'\Key.csv')
【问题讨论】:
-
你有很多重复的代码;您可以将繁琐的代码提取到可以传入参数的辅助方法(小函数)中
-
如果您的代码没有错误,请尝试禁用 gc 并手动收集垃圾(如果您使用的是 2.6)。
-
也许尝试分解代码以便能够隔离问题。将所有这些部分分隔在不同的功能中,并添加一个负责跟踪的装饰器,例如
timestamp - func name started - parameters和timestamp - func name ended - duration。您调用的服务可能存在处理不当的问题,但在 500 行程序中很难判断! -
大家好,感谢您的建议,我会尝试并报告发生的情况。对代码的混乱感到抱歉,我知道那里有很多冗余,我更多的是为了时间而不是为了清洁。显然是后视镜不好,大声笑