年前接到了一个新需求:把公司旧门户网站上的新闻迁移到新的门户网站上。我们公司的网址是 www.jylink.com。
像这种好玩的事,怎么能少了我这种小机智,开始动手来做
我的目标是把公司 2012–2017 年的新闻存到数据库中。喏,就是这样的新闻,图片上有我们帅气的董事长。
保存到如下两张数据表中:
先爬取每页的新闻列表保存到本地去
def getNewsList(page_count=17):
    """Download the paginated news list and save each page as a local file.

    The list URL is read from ``[url] jylink_news`` and the output directory
    from ``[file_location] news_list`` in ``URLConfig.ini``. Page ``i`` is
    requested with the query parameter ``pg=i`` and written, UTF-8 encoded,
    to a file named ``第<i>页.html`` inside the output directory.

    Args:
        page_count: Number of list pages to fetch (pages 1..page_count).
            Defaults to 17, the value that was previously hard-coded; adjust
            it when the number of pages on the site changes.
    """
    # Read the configuration file.
    cf = config_util.readConfig('URLConfig.ini')
    # URL of the news list to crawl.
    news_url = cf.get('url', 'jylink_news')
    # Directory where the downloaded list pages are stored.
    filePath = cf.get('file_location', 'news_list')

    for page in range(1, page_count + 1):
        r = requests.get(news_url, params={"pg": page})
        # Windows-style separator kept on purpose: the reader functions
        # build the very same file name.
        file_name = filePath + "\\第" + str(page) + "页.html"
        # ``with`` guarantees the file is flushed and closed for every page.
        with codecs.open(file_name, 'w', 'utf-8') as target:
            target.write(r.text)
然后对新闻列表页进行数据清洗和保存
def getNewsPage(current_page):
    """Parse one saved news-list page, store every entry and return extras.

    Reads ``第<current_page>页.html`` from the directory configured under
    ``[file_location] news_list``, selects every ``<a>`` below
    ``<div class="news">`` and, for each of them, calls ``saveOfCmsContent``
    with the extracted title and creation time plus fixed column/site values.

    Args:
        current_page: Page number of the saved list page. May be an ``int``
            or a ``str``; it is converted with ``str()``.

    Returns:
        A list with one ``[create_time, keyword]`` pair per news entry, in
        document order.

    Raises:
        IndexError: If an entry lacks one of the expected nodes.
    """
    # Read the configuration file.
    cf = config_util.readConfig('URLConfig.ini')
    # Directory where the downloaded list pages are stored.
    filePath = cf.get('file_location', 'news_list')
    # str() so that both 3 and "3" are accepted, as in saveNewsDetail.
    file_name = filePath + "\\第" + str(current_page) + "页.html"
    with codecs.open(file_name, "r", "utf-8") as f:
        content = f.read()

    html_content = etree.HTML(content)
    result = html_content.xpath('//div[@class="news"]//a')

    # Fixed values passed to saveOfCmsContent for every migrated entry.
    site_id = 1
    column_id = 33
    form_id = 1
    topic_id = 0
    clicks = 0
    create_people = 'admin'
    check_status = 1
    status = 1

    part_news_detail_array = []
    for s, link in enumerate(result, start=1):
        # The keyword is taken from the 4th text node below the anchor.
        keyword = link.xpath('.//text()')[3]
        title_name = link.xpath('./p/big/text()')
        # The date is split across <span> and its nested <font>; the two
        # parts are joined with "-".
        create_time = (
            link.xpath('./span/text()')[0]
            + "-"
            + link.xpath('./span/font/text()')[0]
        )
        saveOfCmsContent(site_id, column_id, form_id, topic_id, title_name[0],
                         clicks, create_people, create_time, check_status,
                         status)
        # Progress output: title list, blank line, running entry number.
        print('title name', title_name)
        print('')
        print(s)
        part_news_detail_array.append([create_time, keyword])
    return part_news_detail_array
最后对新闻详情进行数据处理和保存。其中部分新闻属性是通过 XPath 节点获取的;而新闻详情是用富文本编辑器编辑的,所以正文部分我改用正则表达式来处理和获取:
def saveNewsDetail(current_page):
    """Collect the detail fields of every news item linked from a list page.

    Reads the saved list page ``第<current_page>页.html``, takes the ``href``
    of every ``<a>`` below ``<div class="news">`` and, for each link, opens
    the locally stored detail file ``<current_page>---<pid>.html`` from the
    ``[file_location] news_detail`` directory. The detail files must already
    exist; this function does not download them.

    Args:
        current_page: Page number of the saved list page.

    Returns:
        A list with one entry per news item, each of the form
        ``[column_id, form_id, site_id, title_name, source, content]``.

    Raises:
        KeyError: If a link does not carry the expected ``pid`` parameter.
        IndexError: If a detail page lacks the body span or the title node.
    """
    page_array = []
    # Read the configuration file.
    cf = config_util.readConfig('URLConfig.ini')
    # Directory where the downloaded list pages are stored.
    filePath = cf.get('file_location', 'news_list')
    # Directory where the downloaded detail pages are stored.
    storage_path = cf.get('file_location', 'news_detail')

    # File name of the saved list page.
    list_file = filePath + "\\第" + str(current_page) + "页.html"
    with codecs.open(list_file, "r", "utf-8") as f:
        list_html = f.read()
    hrefs = etree.HTML(list_html).xpath('//div[@class="news"]//a/@href')

    # The article body is extracted with a regex rather than XPath; the
    # pattern is compiled once, outside the loop.
    body_pattern = re.compile(
        r'<span class=\"newdet\">(.*?)</span></div>', re.S | re.M)

    # Fixed values attached to every migrated article.
    form_id = 1
    column_id = 33
    site_id = 1
    source = '本站'

    for href in hrefs:
        # The whole href (not only its query string) is handed to parse_qs,
        # hence the key includes the "NewsDefault.aspx?" prefix.
        pid = parse_qs(href)['NewsDefault.aspx?pid']
        detail_file = (storage_path + "\\" + str(current_page) + '---'
                       + pid[0] + ".html")
        # ``with`` closes the file even when the parsing below raises.
        with codecs.open(detail_file, 'r', 'utf-8') as target:
            detail = target.read()

        detail_html = etree.HTML(detail)
        content = body_pattern.findall(detail)[0]
        title_name = detail_html.xpath('//div[@id="ctl00_ContentPlaceHolder2_countData"]/span[@class="newdet_Title yahei"]/text()')[0]

        page_array.append(
            [column_id, form_id, site_id, title_name, source, content])
    return page_array
最后,把上一个方法返回的数组处理之后保存到数据库:
def saveOfCmsContentField(column_id,form_id,site_id,title_name,source,content,content_id):
    """Insert the six detail fields of one article into of_cms_content_field.

    One row is written per field (``column_id``, ``form_id``, ``site_id``,
    ``title_name``, ``source``, ``content``), all in a single transaction.
    On any error the message is printed and the transaction is rolled back;
    the exception is not re-raised.

    Args:
        column_id: Value stored under the name ``column_id``.
        form_id: Value stored under the name ``form_id``. The table's own
            ``form_id`` column is always written as 1, regardless of this.
        site_id: Value stored under the name ``site_id``.
        title_name: Value stored under the name ``title_name``.
        source: Value stored under the name ``source``.
        content: Value stored under the name ``content``.
        content_id: Id written to the ``content_id`` column of every row.
    """
    # One parameterised statement replaces six copies that differed only in
    # the field-name literal.
    insert_sql = ('insert into of_cms_content_field'
                  '(content_id, form_id,name,value) VALUES(%s,1,%s,%s)')
    fields = (
        ('column_id', column_id),
        ('form_id', form_id),
        ('site_id', site_id),
        ('title_name', title_name),
        ('source', source),
        ('content', content),
    )

    db = db_util.getDBConnect()
    try:
        cursor = db.cursor()
        try:
            inserted = 0
            for name, value in fields:
                # execute() is expected to return the affected-row count, as
                # the original code summed its results.
                inserted += cursor.execute(insert_sql,
                                           (content_id, name, value))
            print(f'增加了{inserted}条')
            db.commit()
        except Exception as e:
            print("Unexpected Error: {}".format(e))
            db.rollback()
    finally:
        # Always release the connection, even if rollback itself fails.
        db.close()
def saveOfCmsContentFieldPart2(content_id,create_time,keyword):
    """Insert the create_time and keyword fields of one article.

    Writes two rows into ``of_cms_content_field`` in a single transaction.
    On any error the message is printed and the transaction is rolled back;
    the exception is not re-raised.

    Args:
        content_id: Id written to the ``content_id`` column of both rows.
        create_time: Value stored under the name ``create_time``.
        keyword: Value stored under the name ``keyword``.
    """
    # One parameterised statement serves both rows.
    insert_sql = ('insert into of_cms_content_field'
                  '(content_id, form_id,name,value) VALUES(%s,1,%s,%s)')
    fields = (
        ('create_time', create_time),
        ('keyword', keyword),
    )

    db = db_util.getDBConnect()
    try:
        cursor = db.cursor()
        print(content_id,create_time,keyword)
        try:
            inserted = 0
            for name, value in fields:
                # execute() is expected to return the affected-row count, as
                # the original code summed its results.
                inserted += cursor.execute(insert_sql,
                                           (content_id, name, value))
            db.commit()
            print(f'保存成功!,增加{inserted}个')
        except Exception as e:
            print("Unexpected Error: {}".format(e))
            db.rollback()
    finally:
        # Always release the connection, even if rollback itself fails.
        db.close()