# 代码可直接复制到 Python 文件中运行
# 1. File operations.
# Built-in functions and methods: open() opens a file, read() reads it,
# readline() reads one line, seek() moves the file pointer, write() writes,
# close() closes.  Using "with" guarantees the handle is closed even when an
# exception occurs.

# Write to a file (creates name.txt when it does not exist).
with open('name.txt', 'w') as file1:
    file1.write("20200202")

# Read the whole file back.
with open('name.txt') as file2:
    content = file2.read()  # renamed from "str": never shadow the builtin
print(content)

# Append to the file ('a' mode); "\n" inside the string starts a new line.
with open('name.txt', 'a') as file3:
    file3.write("\n11111")

# Read a single line.
with open('name.txt') as file4:
    print(file4.readline())

# Read line by line (iterating the file object streams the lines).
with open('name.txt') as file5:
    for line in file5:
        print(line)

# After reading, the file pointer has advanced; seek(0) rewinds it.
with open('name.txt') as file6:
    print(file6.readline())
    print(file6.seek(0))  # seek() returns the new absolute position (0)

# 2. Exception detection and handling.
try:
    a = 1 / 0
except ZeroDivisionError as e:  # catch the narrowest exception that can occur
    print('捕获到的异常是 %s' % e)
finally:
    print('最终都会执行的语句')

# 3. Variable-length arguments: *other collects the extra positional
# arguments into a tuple.
def howLong(first, *other):
    print(first)
    print(other)

howLong('123', '1222', '1111')

# 4. Iterators and generators.
sample_set = {1, 2, 3}  # renamed from "list1": this is a set, not a list
it = iter(sample_set)
# next() pulls one element at a time from the iterator.
print(next(it))
print(next(it))
print(next(it))

def frange(start, stop, step):
    """Like range(), but supports float steps (a generator via yield)."""
    x = start
    while x < stop:
        yield x  # the yield keyword makes this function a generator
        x += step

for i in frange(10, 12, 0.5):
    print(i)

# 5. Lambda expression: an anonymous function.
add = lambda x, y: x + y
print(add(2, 4))

# 6. Built-in functional helpers.
a = [1, 2, 34, 5, 6]
# filter(): keep the elements of a that are greater than 2
# (the original comment had a typo: "够快" should be "筛选")
print(list(filter(lambda x: x > 2, a)))
# map(): add one to each element of a
print(list(map(lambda x: x + 1, a)))
# map() over several lists: element-wise sum of a and b
b = [3, 4, 5, 9]
print(list(map(lambda x, y: x + y, a, b)))
# reduce() must be imported from functools: cumulative sum, initial value 4
from functools import reduce
print(reduce(lambda x, y: x + y, [1, 2, 3], 4))
# zip(): swap keys and values of a dict
dicta = {'aa': 'a', 'bb': 'b', 'cc': 'c'}
dictc = zip(dicta.values(), dicta.keys())
print(list(dictc))
# 7. Python closures: a nested function that captures a variable from the
# enclosing scope.
def make_adder(a):  # renamed from "sum": never shadow the builtin sum()
    def add(b):
        return a + b
    return add

num27 = make_adder(2)
print(num27(4))

# 8. Multithreading.
import threading
from threading import current_thread

class Mythread(threading.Thread):
    def run(self):
        # current_thread().name replaces the deprecated getName()
        print(current_thread().name, 'start')
        print('run')
        print(current_thread().name, 'start')

t1 = Mythread()
t1.start()
t1.join()  # thread synchronisation: wait until t1 has finished
print(current_thread().name, 'end')

# 9. Regular expressions (re).
# .      any single character
# ^      anchors the match at the start of the string
# $      anchors the match at the end of the string
# *      previous token repeated 0..n times
# +      previous token repeated 1..n times
# ?      previous token repeated 0 or 1 time
# {m}    previous token repeated exactly m times
# {m,n}  previous token repeated m..n times
# []     any one character from the set
# |      alternation: left or right side
# \d     a digit
# \D     a non-digit
# \s     a whitespace character (the original comment wrongly said "string")
# ()     grouping
import re
p = re.compile('.{3}')  # any character, exactly three times
print(p.match('d'))  # None: 'd' is only one character
p1 = re.compile('jpg$')  # strings ending in "jpg"
print(p1.match('d'))  # None
p2 = re.compile('ca*')  # 'c' followed by zero or more 'a' (comment fixed)
print(p2.match('cat'))
p3 = re.compile('a{4}')  # 'a' exactly four times
print(p3.match('caaaat'))  # None: match() anchors at position 0
p4 = re.compile('c[bcd]t')  # middle character is one of b, c, d
print(p4.match('cat'))  # None: 'a' is not in [bcd]
# Grouping
p5 = re.compile(r'(\d+)-(\d+)-(\d+)')
print(p5.match('2019-02-02'))  # matches a date
print(p5.match('2019-02-02').group(1))  # the first group: the year
year, month, day = p5.match('2019-02-02').groups()
print(year, month, day)
# match() anchors at the beginning of the string; search() scans it.
print(p5.match('aaa2019-02-02'))  # None
print(p5.search('aaa2019-02-02'))
# sub(): replace matches
phone = '123-456-789 # 这是电话号码'
print(re.sub(r'#.*$', '', phone))  # strip everything after the '#'
print(re.sub(r'\D', '', phone))  # strip every non-digit
# 10. Date and time libraries.
import time
print(time.time())  # seconds since the 1970 epoch
print(time.localtime())
print(time.strftime('%Y-%m-%d %H:%M:%S'))

import datetime
# datetime supports arithmetic on dates and times.
print(datetime.datetime.now())
new_time = datetime.timedelta(minutes=10)
print(datetime.datetime.now() + new_time)  # ten minutes from now
one_day = datetime.datetime(2019, 9, 9)
new_day = datetime.timedelta(days=10)
print(one_day + new_day)

# 11. Fetching web pages with urllib.
from urllib import request
url = 'http://www.baidu.com'
response = request.urlopen(url, timeout=1)
# print(response.read().decode('utf-8'))

# 12. GET and POST requests.
from urllib import parse
data = bytes(parse.urlencode({'world': 'hello'}), encoding='utf8')
# print(data)
response = request.urlopen('http://httpbin.org/post', data=data)
# print(response.read().decode('utf-8'))

import socket
import urllib.error  # explicit: do not rely on urllib.request importing it
try:
    response2 = request.urlopen('http://httpbin.org/get', timeout=1)
    # print(response2.read())
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print("time out")

# 13. The requests library.
# GET request
import requests
url2131 = 'http://httpbin.org/get'
data2131 = {'key': 'value', 'abc': 'xyz'}
response2131 = requests.get(url2131, data2131)  # 2nd positional arg = params
# print(response2131.text)
# POST request
url2132 = 'http://httpbin.org/post'
data2132 = {'key': 'value', 'abc': 'xyz'}
response2132 = requests.post(url2132, data2132)  # 2nd positional arg = data
# print(response2132.json())

# 14. Scraping links with a regular expression.
import re
content = requests.get('http://www.cnu.cc/discoveryPage/hot-人像').text
# print(content)
pattern2141 = re.compile(r'<a href="(.*?)".*?title">(.*?)</div>', re.S)  # name typo fixed
results2141 = re.findall(pattern2141, content)
# print('ssssss', results2141)
for result2141 in results2141:
    url2141, name2141 = result2141
    # print(url2141, re.sub(r'\s', '', name2141))

# 15. Parsing HTML with Beautiful Soup.
# pip3 install bs4
from bs4 import BeautifulSoup
soup = BeautifulSoup(content, 'lxml')
# print(soup.prettify())  # pretty-printed document
# print(soup.title)
# print(soup.title.string)  # the title text
# print(soup.p)  # first <p> tag
# print(soup.a)  # first <a> tag
# print(soup.find(id='link3'))  # tag with id="link3"
# Collect the href of every <a> tag:
# for link in soup.find_all('a'):
#     print(link.get('href'))
# print(soup.get_text())  # all text content in the document

# 16. Scraping news page titles.
# from bs4 import BeautifulSoup
# import requests
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Connection": "close",
    "Cookie": "_gauges_unique_hour=1; _gauges_unique_day=1; _gauges_unique_month=1; _gauges_unique_year=1; _gauges_unique=1",
    "Referer": "http://www.infoq.com",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER"
}
url2161 = 'https://www.infoq.com/news/'


# Fetch a page and print its full HTML.
def craw(url2162):
    response2162 = requests.get(url2162, headers=headers)
    print(response2162.text)

# craw(url2161)


# Fetch a page and print the titles of the news items on it.
def craw2(url2163):
    response2163 = requests.get(url2163, headers=headers)
    soup2163 = BeautifulSoup(response2163.text, 'lxml')
    for title_href in soup2163.find_all('div', class_='items__content'):
        print([title.get('title')
               for title in title_href.find_all('a')
               if title.get('title')])

# craw2(url2161)

# Paging:
# for i in range(15, 46, 15):
#     url2164 = 'http://www.infoq.com/news/' + str(i)
#     # print(url2164)
#     craw2(url2164)

# 17. Crawling and downloading images.
from bs4 import BeautifulSoup
import requests
import os
import shutil

# NOTE: the original declared a second, byte-identical "headers" dict here;
# the duplicate was removed and the dict defined above is reused.
url = 'http://www.infoq.com/presentations'


# Download one image.
# Requests wraps the HTTP plumbing in a friendly client but provides no
# download-to-file helper.  With stream=True the request downloads only the
# response headers and keeps the connection open; the body is pulled down
# when Response.raw is consumed.
def download_jpg(image_url, image_localpath):
    """Stream the image at image_url to image_localpath on disk."""
    response = requests.get(image_url, stream=True)
    if response.status_code == 200:
        with open(image_localpath, 'wb') as f:
            # Bug fix: the original wrote "deconde_content" (typo), which set
            # a meaningless attribute; decode_content=True tells urllib3 to
            # undo any gzip/deflate transfer encoding while copying the body.
            response.raw.decode_content = True
            shutil.copyfileobj(response.raw, f)


# Collect and download the presentation images from one page.
def craw3(url):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    for pic_href in soup.find_all('div', class_='items__content'):
        for pic in pic_href.find_all('img'):
            imgurl = pic.get('src')
            save_dir = os.path.abspath('.')  # renamed from "dir": builtin shadow
            filename = os.path.basename(imgurl)
            imgpath = os.path.join(save_dir, filename)
            print('开始下载 %s' % imgurl)
            download_jpg(imgurl, imgpath)

# craw3(url)

# Paging: fetch pages 12, 24 and 36.
j = 0
for i in range(12, 37, 12):
    # Bug fix: the original concatenated 'presentations' + str(i), producing
    # ".../presentations12"; a '/' separator is required (compare the news
    # paging loop in section 16, whose base URL ends with '/').
    url = 'http://www.infoq.com/presentations/' + str(i)
    j += 1
    print('第 %d 页' % j)
    craw3(url)