# Runka
# Validate the format of an e-mail address with a regular expression.
import re

# Local part: word characters with optional dot-separated segments.
# Domain: word characters followed by one to three ".xx" / ".xxx" suffixes.
mail = r'^(\w)+(\.\w+)*@(\w)+((\.\w{2,3}){1,3})$'
myMail = '45612563@qq.com'
# Match once and reuse the result instead of evaluating the regex twice.
mailMatch = re.match(mail, myMail)
if mailMatch:
    print(mailMatch.group(0))
else:
    print('error')

  

# Extract every telephone number from the text with a regular expression.
# The text is bound to ``contactText`` rather than ``str`` so the builtin
# ``str`` type is not shadowed for the rest of the script.
contactText = '''
版权所有:广州商学院   地址:广州市黄埔区九龙大道206号
学校办公室:020-82876130   招生电话:020-82872773
粤公网安备 44011602000060号    粤ICP备15103669号
'''
# No capture groups here: findall then returns the full "area-number" strings
# instead of (whole, tail) tuples.
phoneNumbers = re.findall(r'\d{3,4}-\d{6,8}', contactText)
print(phoneNumbers)

  

# Split English text into words with a regular expression (re.split).
news = '''Let your friends underrate your advantage,while let your enemies overrate your disadvantage..'''
# The text ends with delimiters, so re.split yields a trailing empty string;
# drop empty pieces so that only real words remain.
words = [word for word in re.split(r'[\s,.?\-]+', news) if word]
print(words)

  

import requests
from  bs4 import  BeautifulSoup
from datetime import datetime
import re

# Download the campus-news listing page and parse it into ``soup``.
newsUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
# A timeout keeps the script from hanging forever on an unresponsive server.
res = requests.get(newsUrl, timeout=10)
# Fail loudly on an HTTP error instead of silently parsing an error page.
res.raise_for_status()
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')

# Get the click count
def getClickTime(newsUrl):
    """Return the click count for the news article at ``newsUrl``.

    The article id is taken from the text between the first underscore and
    the last ``.html`` in the URL: that text is split on ``/`` and the second
    piece is used. The id is sent to the counter API and the response text is
    searched for ``hits').html('<count>');``.

    Args:
        newsUrl: URL of the article detail page.

    Returns:
        The captured count as a string (it is not converted to ``int``).

    Raises:
        ValueError: if no id can be extracted from ``newsUrl`` or the API
            response does not contain the hit counter.
    """
    idMatch = re.search(r'_(.*)\.html', newsUrl)
    idParts = idMatch.group(1).split('/') if idMatch else []
    if len(idParts) < 2:
        raise ValueError('cannot extract news id from URL: {}'.format(newsUrl))
    newsId = idParts[1]

    clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
    # A timeout keeps the call from blocking forever on a stalled server.
    clickStr = requests.get(clickUrl, timeout=10).text

    # Non-greedy capture so that only the counter value itself is returned
    # even if more statements follow it in the response.
    hitsMatch = re.search(r"hits'\)\.html\('(.*?)'\);", clickStr)
    if hitsMatch is None:
        raise ValueError('hit counter not found in response from {}'.format(clickUrl))
    return hitsMatch.group(1)


# Get the news details
def getNewDetail(url):
    """Fetch one news detail page and print its metadata.

    The text of the first ``.show-info`` element is searched for the
    publication time, author, reviewer, source and photographer, which are
    printed together with the link, the title and the click count.

    The title is NOT taken from the page: it is read from the module-level
    name ``title``, which the caller must set before calling this function.

    Args:
        url: URL of the article detail page.

    Raises:
        ValueError: if one of the expected fields is missing from the info
            text, or the publication time does not match
            ``%Y-%m-%d %H:%M:%S``.
    """
    # A timeout keeps the call from blocking forever on a stalled server.
    resd = requests.get(url, timeout=10)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    info = soupd.select('.show-info')[0].text

    def field(pattern):
        # Return group 1 of ``pattern`` in the info text, with a clear error
        # instead of an AttributeError on None when the field is absent.
        found = re.search(pattern, info)
        if found is None:
            raise ValueError('pattern {!r} not found in article info'.format(pattern))
        return found.group(1)

    publishedText = field('发布时间:(.*) \xa0\xa0 \xa0\xa0作者:')
    dtime = datetime.strptime(publishedText, '%Y-%m-%d %H:%M:%S')

    print('链接:' + url)
    print('标题:' + title)
    print('发布时间:{}'.format(dtime))
    print('作者:' + field('作者:(.*)审核:'))
    print('审核:' + field('审核:(.*)来源:'))
    print('来源:' + field('来源:(.*)摄影:'))
    print('摄影:' + field('摄影:(.*)点击'))
    # Use this function's own argument rather than a global link variable.
    print('点击次数:' + getClickTime(url))


# Walk the listing items and print the details of the first news entry only.
for item in soup.select('li'):
    titleNodes = item.select('.news-list-title')
    if not titleNodes:
        continue  # this <li> is not a news entry
    # getNewDetail reads the module-level ``title``, so set it before the call.
    title = titleNodes[0].text
    # Link to the news detail page
    a = item.a.attrs['href']
    # Fetch and print the details of this article
    getNewDetail(a)
    break  # only the first news item is processed

  

 

# Category:

# Technical points:

# Related articles: