# Regular-expression exercises: e-mail validation, phone-number extraction
# and English word splitting.
import re

# --- Check whether a string is a well-formed e-mail address ---------------
# Raw strings keep the backslashes intact for the regex engine.
mail = r'^(\w)+(\.\w+)*@(\w)+((\.\w{2,3}){1,3})$'
myMail = '45612563@qq.com'
mailMatch = re.match(mail, myMail)  # match once and reuse the result
if mailMatch:
    print(mailMatch.group(0))
else:
    print('error')

# --- Find every phone number in the page footer text ----------------------
# Named footerText (not ``str``) so the builtin ``str`` is not shadowed.
footerText = '''
版权所有:广州商学院 地址:广州市黄埔区九龙大道206号
学校办公室:020-82876130 招生电话:020-82872773
粤公网安备 44011602000060号 粤ICP备15103669号
'''
# The pattern has no capture groups, so findall() returns the complete
# numbers as plain strings instead of (number, suffix) tuples.
phoneNumbers = re.findall(r'\d{3,4}-\d{6,8}', footerText)
print(phoneNumbers)

# --- Split an English sentence into words ---------------------------------
news = '''Let your friends underrate your advantage,while let your enemies overrate your disadvantage..'''
# re.split() yields an empty string after the trailing "..", which is not a
# word, so empty tokens are filtered out.
words = [word for word in re.split(r'[\s,.?\-]+', news) if word]
print(words)
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
# Download the campus-news index page and parse it into a BeautifulSoup tree.
newsUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
# A timeout keeps the script from hanging forever if the server never answers.
res = requests.get(newsUrl, timeout=10)
# Decode the response body as UTF-8 instead of relying on the guessed encoding.
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
def getClickTime(newsUrl, timeout=10):
    """Return the click count of a news article as a string.

    The article id is cut out of ``newsUrl``: the text between the first
    ``_`` and the final ``.html`` is split on ``/`` and its second piece is
    used as the id. The id is then sent to the click-counter API, and the
    count is read from the ``hits').html('<count>');`` snippet in the
    response text.

    Args:
        newsUrl: URL of the article detail page. Assumed to look like
            ``.../xiaoyuanxinwen_0404/9183.html`` -- TODO confirm against
            the live site.
        timeout: Seconds to wait for the counter API before giving up.

    Returns:
        The click count exactly as it appears in the API response (a str).

    Raises:
        ValueError: If no article id can be extracted from ``newsUrl`` or the
            API response does not contain the expected snippet.
    """
    # Raw string with an escaped dot so only a literal ".html" suffix matches.
    idMatch = re.search(r'_(.*)\.html', newsUrl)
    idParts = idMatch.group(1).split('/') if idMatch else []
    if len(idParts) < 2:
        raise ValueError('cannot extract news id from URL: {}'.format(newsUrl))
    newsId = idParts[1]

    clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
    clickStr = requests.get(clickUrl, timeout=timeout).text

    countMatch = re.search(r"hits'\)\.html\('(.*)'\);", clickStr)
    if countMatch is None:
        raise ValueError('unexpected click-count response: {!r}'.format(clickStr))
    return countMatch.group(1)
def getNewDetail(url, title=None, timeout=10):
    """Download one news article page and print its metadata.

    Prints the link, title, publish time, author, reviewer, source,
    photographer and click count, all parsed from the text of the page's
    ``.show-info`` element (the click count comes from ``getClickTime``).

    Args:
        url: URL of the article detail page.
        title: Headline to print. When omitted, the module-level ``title``
            variable is used if it exists (this keeps older one-argument
            callers working); otherwise an empty string is printed.
        timeout: Seconds to wait for the article page before giving up.

    Raises:
        IndexError: If the page has no ``.show-info`` element.
        ValueError: If the publish time is missing or not in
            ``%Y-%m-%d %H:%M:%S`` format.
    """
    resd = requests.get(url, timeout=timeout)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    info = soupd.select('.show-info')[0].text

    def field(pattern):
        # Return the first capture group, or '' when the label is absent,
        # so a missing optional field does not abort the whole report.
        found = re.search(pattern, info)
        return found.group(1) if found else ''

    if title is None:
        title = globals().get('title', '')

    timeText = field(r'发布时间:(.*) \xa0\xa0 \xa0\xa0作者:')
    dtime = datetime.strptime(timeText, '%Y-%m-%d %H:%M:%S')

    print('链接:' + url)
    print('标题:' + title)
    print('发布时间:{}'.format(dtime))
    print('作者:' + field(r'作者:(.*)审核:'))
    print('审核:' + field(r'审核:(.*)来源:'))
    print('来源:' + field(r'来源:(.*)摄影:'))
    print('摄影:' + field(r'摄影:(.*)点击'))
    # Look up the clicks for the page being processed, not a global URL.
    print('点击次数:' + getClickTime(url))
# Walk the <li> elements of the index page and print the details of the
# first one that is an actual news entry.
for news in soup.select('li'):
    titleNodes = news.select('.news-list-title')
    if not titleNodes:
        continue  # this <li> carries no news title; skip it
    # ``title`` and ``a`` stay module-level names: getNewDetail looks up
    # ``title`` at module level when printing the headline.
    title = titleNodes[0].text
    # Link to the news detail page
    a = news.a.attrs['href']
    # Fetch and print the article details
    getNewDetail(a)
    break  # only the first news entry is processed