【问题标题】:python program freezes when running for a few dayspython程序运行几天就死机
【发布时间】:2016-06-29 20:30:47
【问题描述】:

问题来了:

我正在编写一个 python 程序,其目的是不断地从 RSS 提要中收集新闻。我希望程序收集 1 周的数据。问题是该程序永远不会到周末。有时它在运行几天后冻结,有时几个小时甚至几分钟。它总是冻结,没有错误。当我说冻结时,我的意思是解释器似乎仍在运行,因为我不能给它任何额外的命令。我怎么解决这个问题?

我将在下面发布代码。谢谢大家!!

from goose import Goose
from requests import get
import urllib2
import feedparser
from urllib2 import urlopen
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import datetime as dt
import time
import os
Symbols=['AAPL','T','BA','XOM','GOOG','JPM','PG','WMT']
url='http://finance.yahoo.com/rss/headline?s='

for t in xrange(7):
    AAPL=pd.DataFrame()
    AAPL['Published']=""
    AAPL['Title']=""
    AAPL['link']=""
    AAPL['ID']=""
    AAPL['News']=""

    T=pd.DataFrame()
    T['Published']=""
    T['Title']=""
    T['link']=""
    T['ID']=""
    T['News']=""

    BA=pd.DataFrame()
    BA['Published']=""
    BA['Title']=""
    BA['link']=""
    BA['ID']=""
    BA['News']=""

    XOM=pd.DataFrame()
    XOM['Published']=""
    XOM['Title']=""
    XOM['link']=""
    XOM['ID']=""
    XOM['News']=""

    GOOG=pd.DataFrame()
    GOOG['Published']=""
    GOOG['Title']=""
    GOOG['link']=""
    GOOG['ID']=""
    GOOG['News']=""

    JPM=pd.DataFrame()
    JPM['Published']=""
    JPM['Title']=""
    JPM['link']=""
    JPM['ID']=""
    JPM['News']=""

    PG=pd.DataFrame()
    PG['Published']=""
    PG['Title']=""
    PG['link']=""
    PG['ID']=""
    PG['News']=""

    WMT=pd.DataFrame()
    WMT['Published']=""
    WMT['Title']=""
    WMT['link']=""
    WMT['ID']=""
    WMT['News']=""




    DaysIDsAAPL=[]
    DaysIDsT=[]
    DaysIDsBA=[]
    DaysIDsXOM=[]
    DaysIDsGOOG=[]
    DaysIDsJPM=[]
    DaysIDsPG=[]
    DaysIDsWMT=[]



    count=0

    AAPLCount=0
    TCount=0
    BACount=0
    XOMCount=0
    GOOGCount=0
    JPMCount=0
    PGCount=0
    WMTCount=0

    date=dt.date.today()

    newpathAAPL = r'D:\News Data\AAPL\\'+str(t)
    newpathT = r'D:\News Data\T\\'+str(t)
    newpathBA = r'D:\News Data\BA\\'+str(t)
    newpathXOM = r'D:\News Data\XOM\\'+str(t)
    newpathGOOG = r'D:\News Data\GOOG\\'+str(t)
    newpathJPM = r'D:\News Data\JPM\\'+str(t)
    newpathPG = r'D:\News Data\PG\\'+str(t)
    newpathWMT = r'D:\News Data\WMT\\'+str(t)
    os.makedirs(newpathAAPL)
    os.makedirs(newpathT)
    os.makedirs(newpathBA)
    os.makedirs(newpathXOM)
    os.makedirs(newpathGOOG)
    os.makedirs(newpathJPM)
    os.makedirs(newpathPG)
    os.makedirs(newpathWMT)
    while dt.date.today()==date:
        print "Loop"
        try:
        #AAPL inner most loop
            d1=feedparser.parse(url+Symbols[0])  
            for x in xrange(len(d1['entries'])):
                if int(d1.entries[x]['id'][14:]) not in DaysIDsAAPL:
                    DaysIDsAAPL.append(int(d1.entries[x]['id'][14:]))
                    y = len(AAPL.index.tolist())
                    m=re.search(r'\*(.*)',d1.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    AAPL.loc[y,'Title'] =d1.entries[x]['title'].encode('utf8')
                    AAPL.loc[y,'link'] =m.encode('utf8')
                    AAPL.loc[y,'Published'] =d1.entries[x]['published'].encode('utf8')
                    AAPL.loc[y,'ID'] =int(d1.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == '':
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathAAPL+r"\\"+str(AAPLCount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            AAPL.loc[y,'News'] = AAPLCount
                            AAPLCount+=1
                            AAPL=AAPL.fillna("")
                            AAPL.to_csv(newpathAAPL+r'\Key.csv')
                        except:
                            print m
                            print "AAPL"
                    else:
                        Text_file = open(newpathAAPL+r"\\"+str(AAPLCount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        AAPL.loc[y,'News'] =AAPLCount
                        AAPLCount+=1
                        AAPL=AAPL.fillna("")
                        AAPL.to_csv(newpathAAPL+r'\Key.csv')
                    print "AAPL"

            #T inner most loop
            d2=feedparser.parse(url+Symbols[1])

            for x in xrange(len(d2['entries'])):
                if int(d2.entries[x]['id'][14:]) not in DaysIDsT:
                    DaysIDsT.append(int(d2.entries[x]['id'][14:]))
                    y = len(T.index.tolist())
                    m=re.search(r'\*(.*)',d2.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    T.loc[y,'Title'] =d2.entries[x]['title'].encode('utf8')
                    T.loc[y,'link'] =m.encode('utf8')
                    T.loc[y,'Published'] =d2.entries[x]['published'].encode('utf8')
                    T.loc[y,'ID'] =int(d2.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == '':
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathT+r"\\"+str(TCount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            T.loc[y,'News'] = TCount
                            TCount+=1
                            T=T.fillna("")
                            T.to_csv(newpathT+r'\Key.csv')
                        except:
                            print m
                            print "T"
                    else:
                        Text_file = open(newpathT+r"\\"+str(TCount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        T.loc[y,'News'] =TCount
                        TCount+=1
                        T=T.fillna("")
                        T.to_csv(newpathT+r'\Key.csv')
                    print "T"

            #BA inner most loop
            d3=feedparser.parse(url+Symbols[2])

            for x in xrange(len(d3['entries'])):
                if int(d3.entries[x]['id'][14:]) not in DaysIDsBA:
                    DaysIDsBA.append(int(d3.entries[x]['id'][14:]))
                    y = len(BA.index.tolist())
                    m=re.search(r'\*(.*)',d3.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    BA.loc[y,'Title'] =d3.entries[x]['title'].encode('utf8')
                    BA.loc[y,'link'] =m.encode('utf8')
                    BA.loc[y,'Published'] =d3.entries[x]['published'].encode('utf8')
                    BA.loc[y,'ID'] =int(d3.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == '':
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathBA+r"\\"+str(BACount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            BA.loc[y,'News'] = BACount
                            BACount+=1
                            BA=BA.fillna("")
                            BA.to_csv(newpathBA+r'\Key.csv')
                        except:
                            print m
                            print "BA"
                    else:
                        Text_file = open(newpathBA+r"\\"+str(BACount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        BA.loc[y,'News'] =BACount
                        BACount+=1
                        BA=BA.fillna("")
                        BA.to_csv(newpathBA+r'\Key.csv')
                    print "BA"

            #XOM inner most loop
            d4=feedparser.parse(url+Symbols[3])

            for x in xrange(len(d4['entries'])):
                if int(d4.entries[x]['id'][14:]) not in DaysIDsXOM:
                    DaysIDsXOM.append(int(d4.entries[x]['id'][14:]))
                    y = len(XOM.index.tolist())
                    m=re.search(r'\*(.*)',d4.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    XOM.loc[y,'Title'] =d4.entries[x]['title'].encode('utf8')
                    XOM.loc[y,'link'] =m.encode('utf8')
                    XOM.loc[y,'Published'] =d4.entries[x]['published'].encode('utf8')
                    XOM.loc[y,'ID'] =int(d4.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == '':
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathXOM+r"\\"+str(XOMCount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            XOM.loc[y,'News'] = XOMCount
                            XOMCount+=1
                            XOM=XOM.fillna("")
                            XOM.to_csv(newpathXOM+r'\Key.csv')
                        except:
                            print m
                            print "XOM"
                    else:
                        Text_file = open(newpathXOM+r"\\"+str(XOMCount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        XOM.loc[y,'News'] =XOMCount
                        XOMCount+=1
                        XOM=XOM.fillna("")
                        XOM.to_csv(newpathXOM+r'\Key.csv')

            #GOOG inner most loop
            d5=feedparser.parse(url+Symbols[4])

            for x in xrange(len(d5['entries'])):
                if int(d5.entries[x]['id'][14:]) not in DaysIDsGOOG:
                    DaysIDsGOOG.append(int(d5.entries[x]['id'][14:]))
                    y = len(GOOG.index.tolist())
                    m=re.search(r'\*(.*)',d5.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    GOOG.loc[y,'Title'] =d5.entries[x]['title'].encode('utf8')
                    GOOG.loc[y,'link'] =m.encode('utf8')
                    GOOG.loc[y,'Published'] =d5.entries[x]['published'].encode('utf8')
                    GOOG.loc[y,'ID'] =int(d5.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == '':
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathGOOG+r"\\"+str(GOOGCount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            GOOG.loc[y,'News'] = GOOGCount
                            GOOGCount+=1
                            GOOG=GOOG.fillna("")
                            GOOG.to_csv(newpathGOOG+r'\Key.csv')
                        except:
                            print m
                            print "GOOG"
                    else:
                        Text_file = open(newpathGOOG+r"\\"+str(GOOGCount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        GOOG.loc[y,'News'] =GOOGCount
                        GOOGCount+=1
                        GOOG=GOOG.fillna("")
                        GOOG.to_csv(newpathGOOG+r'\Key.csv')
                    print "GOOG"

            #JPM inner most loop
            d6=feedparser.parse(url+Symbols[5])

            for x in xrange(len(d6['entries'])):
                if int(d6.entries[x]['id'][14:]) not in DaysIDsJPM:
                    DaysIDsJPM.append(int(d6.entries[x]['id'][14:]))
                    y = len(JPM.index.tolist())
                    m=re.search(r'\*(.*)',d6.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    JPM.loc[y,'Title'] =d6.entries[x]['title'].encode('utf8')
                    JPM.loc[y,'link'] =m.encode('utf8')
                    JPM.loc[y,'Published'] =d6.entries[x]['published'].encode('utf8')
                    JPM.loc[y,'ID'] =int(d6.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == '':
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathJPM+r"\\"+str(JPMCount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            JPM.loc[y,'News'] = JPMCount
                            JPMCount+=1
                            JPM=JPM.fillna("")
                            JPM.to_csv(newpathJPM+r'\Key.csv')
                        except:
                            print m
                            print "JPM"
                    else:
                        Text_file = open(newpathJPM+r"\\"+str(JPMCount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        JPM.loc[y,'News'] =JPMCount
                        JPMCount+=1
                        JPM=JPM.fillna("")
                        JPM.to_csv(newpathJPM+r'\Key.csv')
                    print "JPM"


            #PG inner most loop
            d7=feedparser.parse(url+Symbols[6])

            for x in xrange(len(d7['entries'])):
                if int(d7.entries[x]['id'][14:]) not in DaysIDsPG:
                    DaysIDsPG.append(int(d7.entries[x]['id'][14:]))
                    y = len(PG.index.tolist())
                    m=re.search(r'\*(.*)',d7.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    PG.loc[y,'Title'] =d7.entries[x]['title'].encode('utf8')
                    PG.loc[y,'link'] =m.encode('utf8')
                    PG.loc[y,'Published'] =d7.entries[x]['published'].encode('utf8')
                    PG.loc[y,'ID'] =int(d7.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == "":
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathPG+r"\\"+str(PGCount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            PG.loc[y,'News'] = PGCount
                            PGCount+=1
                            PG=PG.fillna("")
                            PG.to_csv(newpathPG+r'\Key.csv')
                        except:
                            print m
                            print "PG"
                    else:
                        Text_file = open(newpathPG+r"\\"+str(PGCount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        PG.loc[y,'News'] =PGCount
                        PGCount+=1
                        PG=PG.fillna("")
                        PG.to_csv(newpathPG+r'\Key.csv')
                    print "PG"


            #WMT inner most loop
            d8=feedparser.parse(url+Symbols[7])

            for x in xrange(len(d8['entries'])):
                if int(d8.entries[x]['id'][14:]) not in DaysIDsWMT:
                    DaysIDsWMT.append(int(d8.entries[x]['id'][14:]))
                    y = len(WMT.index.tolist())
                    m=re.search(r'\*(.*)',d8.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    WMT.loc[y,'Title'] =d8.entries[x]['title'].encode('utf8')
                    WMT.loc[y,'link'] =m.encode('utf8')
                    WMT.loc[y,'Published'] =d8.entries[x]['published'].encode('utf8')
                    WMT.loc[y,'ID'] =int(d8.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == "":
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathWMT+r"\\"+str(WMTCount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            WMT.loc[y,'News'] = WMTCount
                            WMTCount+=1
                            WMT=WMT.fillna("")
                            WMT.to_csv(newpathWMT+r'\Key.csv')
                        except:
                            print m
                            print "WMT"
                    else:
                        Text_file = open(newpathWMT+r"\\"+str(WMTCount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        WMT.loc[y,'News'] =WMTCount
                        WMTCount+=1
                        WMT=WMT.fillna("")
                        WMT.to_csv(newpathWMT+r'\Key.csv')
                    print "WMT"       
            count+=1
            print count
            time.sleep(1)
        except:
            print "Error"
    AAPL=AAPL.fillna("")
    AAPL.to_csv(newpathAAPL+r'\Key.csv')
    T=T.fillna("")
    T.to_csv(newpathT+r'\Key.csv')
    BA=BA.fillna("")
    BA.to_csv(newpathBA+r'\Key.csv')
    XOM=XOM.fillna("")
    XOM.to_csv(newpathXOM+r'\Key.csv')
    GOOG=GOOG.fillna("")
    GOOG.to_csv(newpathGOOG+r'\Key.csv')
    JPM=JPM.fillna("")
    JPM.to_csv(newpathJPM+r'\Key.csv')
    PG=PG.fillna("")
    PG.to_csv(newpathPG+r'\Key.csv')
    WMT=WMT.fillna("")
    WMT.to_csv(newpathWMT+r'\Key.csv')

【问题讨论】:

  • 你有很多重复的代码;您可以将繁琐的代码提取到可以传入参数的辅助方法(小函数)中
  • 如果您的代码没有错误,请尝试禁用 gc 并手动收集垃圾(如果您使用的是 2.6)。
  • 也许尝试分解代码以便能够隔离问题。将所有这些部分分隔在不同的功能中,并添加一个负责跟踪的装饰器,例如timestamp - func name started - parameterstimestamp - func name ended - duration。您调用的服务可能存在处理不当的问题,但在 500 行程序中很难判断!
  • 大家好,感谢您的建议,我会尝试并报告发生的情况。对代码的混乱感到抱歉,我知道那里有很多冗余,我更多的是为了时间而不是为了清洁。显然是后视镜不好,大声笑

标签: python freeze


【解决方案1】:

程序在收集过多的提要或系统上有其他活动进程时消耗过多的 RAM(这就是冻结时间不同的原因),请参阅Why does a simple python script crash my system

您的程序运行的进程将用于计算的数组和变量存储在 进程内存 内存中

您可以通过强制程序使用硬盘内存来解决此问题。

有关解决方法(shelve定期将收集的提要保存到文本文件(将信息从 ram 移动到 rom 并释放 ram),...)请参阅以下链接

memory usage, how to free memory

Python large variable RAM usage

I need to free up RAM by storing a Python dictionary on the hard drive, not in RAM. Is it possible?

【讨论】:

    猜你喜欢
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 2020-10-23
    • 2020-09-16
    • 2022-11-18
    相关资源
    最近更新 更多