年前接到了一个新需求:把公司旧门户网站的新闻迁移到新的门户网站上.我们公司的网址是 www.jylink.com

像这种好玩的事,怎么能少了我这种小机智,开始动手来做

我的目标是把公司2012-2017年的新闻存到数据库中.喏,就是这样的新闻,图片上有我们帅气的董事长

迁移公司旧门户网站新闻到新的门户网站

保存到如下两张数据表中:

迁移公司旧门户网站新闻到新的门户网站

迁移公司旧门户网站新闻到新的门户网站

先爬取每页的新闻列表保存到本地去

# Fetch every news-list page and save it to local disk
def getNewsList():
    """Download each page of the news list and store it as a local HTML file.

    The list URL is read from ``[url] jylink_news`` and the output directory
    from ``[file_location] news_list`` in ``URLConfig.ini``.  Page ``i`` is
    requested with the URL parameter ``pg=i`` and its response text is written,
    UTF-8 encoded, to ``第<i>页.html`` inside the output directory.
    """
    # Read the configuration file
    cf = config_util.readConfig('URLConfig.ini')
    # URL of the paginated news list
    news_url = cf.get('url','jylink_news')
    # Directory that receives the downloaded list pages
    filePath = cf.get('file_location','news_list')
    # The upper bound 18 has to follow the site: number of list pages + 1.
    for i in range(1,18):
        r = requests.get(news_url,{"pg":i})
        file_name = filePath+"\\第"+str(i)+"页.html"
        # The context manager flushes and closes the file even if the write fails.
        with codecs.open(file_name,'w','utf-8') as target:
            target.write(r.text)

然后对新闻列表页进行数据清洗和保存

# Clean the data of one saved news-list page
# current_page: number of the list page to process
def getNewsPage(current_page):
    """Parse one saved list page, store each news row, and return its extras.

    The page file ``第<current_page>页.html`` is read from the directory
    configured as ``[file_location] news_list`` in ``URLConfig.ini``.  For
    every ``<a>`` under ``div.news`` the title and creation time are extracted
    and handed to ``saveOfCmsContent`` together with fixed column values.

    Args:
        current_page: Page number; it is converted with ``str()``, so both
            ints and strings are accepted.

    Returns:
        A list with one ``[create_time, keyword]`` pair per news link, in
        document order.

    Raises:
        IndexError: If a link lacks the expected text nodes, title or date
            elements.
    """
    # Read the configuration file
    cf = config_util.readConfig('URLConfig.ini')
    # Directory that holds the downloaded list pages
    filePath = cf.get('file_location','news_list')
    file_name = filePath+"\\第"+str(current_page)+"页.html"
    with codecs.open(file_name,"r","utf-8") as f:
        content=f.read()
    html_content=etree.HTML(content)
    result = html_content.xpath('//div[@class="news"]//a')

    # Fixed values passed to saveOfCmsContent for every news item.
    site_id=1
    column_id=33
    form_id=1
    topic_id=0
    clicks=0
    create_people='admin'
    check_status = 1
    status=1

    part_news_detail_array = []
    # s is a 1-based running counter, printed only as progress output.
    for s, i in enumerate(result, 1):
        # The fourth text node below the link is used as the keyword.
        keyword=i.xpath('.//text()')[3]
        title_name=i.xpath('./p/big/text()')
        # The creation time is assembled from the <span> text and its nested <font> text.
        create_time=i.xpath('./span/text()')[0]+"-"+i.xpath('./span/font/text()')[0]
        saveOfCmsContent(site_id,column_id,form_id,topic_id,title_name[0],clicks,create_people,create_time,check_status,status)
        print('title name',title_name)
        print('')
        print(s)
        part_news_detail_array.append([create_time, keyword])
    return part_news_detail_array

最后对新闻详情进行数据处理和保存.其中部分新闻属性是用 xpath 节点获取的;而新闻详情是用富文本编辑器编辑的,所以正文内容我改用正则表达式来提取

def saveNewsDetail(current_page):
    """Collect the detail fields of every news item linked from one list page.

    The list page ``第<current_page>页.html`` is read from the directory
    configured as ``[file_location] news_list``; for each link found on it the
    matching detail file ``<current_page>---<pid>.html`` is read from the
    directory configured as ``[file_location] news_detail``.  This function
    only reads and parses files; it does not write anything itself.

    Args:
        current_page: Page number of the list page (converted with ``str()``).

    Returns:
        A list with one entry per news link, each of the form
        ``[column_id, form_id, site_id, title_name, source, content]``.

    Raises:
        KeyError: If a link's href does not carry the ``pid`` parameter.
        IndexError: If a detail page has no ``newdet`` body span or no title.
    """
    # Fixed values emitted with every news item.
    form_id=1
    column_id=33
    site_id=1
    source = '本站'

    # Read the configuration file
    cf = config_util.readConfig('URLConfig.ini')
    # Directory that holds the downloaded list pages
    filePath = cf.get('file_location','news_list')
    # Directory that holds the downloaded news detail pages
    storage_path = cf.get('file_location','news_detail')
    # File name of the requested list page
    file_name = filePath+"\\第"+str(current_page)+"页.html"
    with codecs.open(file_name,"r","utf-8") as f:
        content = f.read()
    html_content=etree.HTML(content)
    result = html_content.xpath('//div[@class="news"]//a/@href')

    page_array = []
    for i in result:
        # parse_qs is applied to the whole href, so the path prefix becomes
        # part of the key name.
        pid = parse_qs(i)['NewsDefault.aspx?pid']
        file_name = storage_path+"\\"+str(current_page)+'---'+pid[0]+".html"
        # The context manager closes the file even when parsing below raises.
        with codecs.open(file_name,'r','utf-8') as target:
            detail = target.read()
        detail_html = etree.HTML(detail)
        # The body is rich-text HTML, so it is cut out of the raw markup with a
        # regex instead of being read through xpath.
        content_array = re.findall(r'<span class=\"newdet\">(.*?)</span></div>',detail,re.S|re.M)
        content=content_array[0]
        title_name = detail_html.xpath('//div[@id="ctl00_ContentPlaceHolder2_countData"]/span[@class="newdet_Title yahei"]/text()')[0]
        page_array.append([column_id, form_id, site_id, title_name, source, content])
    return page_array

最后就是把上个方法返回的数组进行了处理,保存

def saveOfCmsContentField(column_id,form_id,site_id,title_name,source,content,content_id):
    """Insert the six detail fields of one news item into of_cms_content_field.

    One row is written per field (``column_id``, ``form_id``, ``site_id``,
    ``title_name``, ``source``, ``content``), each with the given
    ``content_id`` and a ``form_id`` column value of 1.  All rows are committed
    together; on any error the message is printed and the transaction is
    rolled back, and the exception is not re-raised.

    Args:
        column_id, form_id, site_id, title_name, source, content: Values stored
            in the ``value`` column under the field name of the same name.
        content_id: Value stored in the ``content_id`` column of every row.
    """
    # The field name is bound as a parameter so one statement serves all rows.
    insert_sql = 'insert into of_cms_content_field(content_id, form_id,name,value) VALUES(%s,1,%s,%s)'
    fields = (
        ("column_id", column_id),
        ("form_id", form_id),
        ("site_id", site_id),
        ("title_name", title_name),
        ("source", source),
        ("content", content),
    )
    db = db_util.getDBConnect()
    try:
        cursor = db.cursor()
        inserted = 0
        for name, value in fields:
            # execute() is relied on to return the number of affected rows.
            inserted += cursor.execute(insert_sql,(content_id,name,value))
        db.commit()
        # Report only after the commit has actually succeeded.
        print(f'增加了{inserted}条')
    except Exception as e:
        print("Unexpected Error: {}".format(e))
        db.rollback()
    finally:
        # Close the connection even if the rollback itself fails.
        db.close()
def saveOfCmsContentFieldPart2(content_id,create_time,keyword):
    """Insert the create_time and keyword fields of one news item.

    Two rows are written to ``of_cms_content_field``, each with the given
    ``content_id`` and a ``form_id`` column value of 1.  Both rows are
    committed together; on any error the message is printed and the
    transaction is rolled back, and the exception is not re-raised.

    Args:
        content_id: Value stored in the ``content_id`` column of both rows.
        create_time: Value stored under the field name ``create_time``.
        keyword: Value stored under the field name ``keyword``.
    """
    # The field name is bound as a parameter so one statement serves both rows.
    insert_sql = 'insert into of_cms_content_field(content_id, form_id,name,value) VALUES(%s,1,%s,%s)'
    db = db_util.getDBConnect()
    try:
        cursor = db.cursor()
        print(content_id,create_time,keyword)
        inserted = 0
        for name, value in (("create_time", create_time), ("keyword", keyword)):
            # execute() is relied on to return the number of affected rows.
            inserted += cursor.execute(insert_sql,(content_id,name,value))
        db.commit()
        print(f'保存成功!,增加{inserted}个')
    except Exception as e:
        print("Unexpected Error: {}".format(e))
        db.rollback()
    finally:
        # Close the connection even if the rollback itself fails.
        db.close()

 

相关文章: