关于爬虫,自己一直在学习,所以更新得比较慢,这才第二更,等不及的小伙伴可以慢慢地品尝了。在看下面的内容之前,建议先把上一章看一下。以下是关于 Python 的 urllib(本文示例使用 Python 2 的 urllib2)的基础和高级用法。

1、如何用最短的时间扒下一个网站

每一个网站都是根据 URL 获取页面信息,页面信息就是一段 HTML 代码,再加上 JS、CSS。HTML 是骨架,JS 是肌肉,CSS 是衣服。以下写一个简单的实例:

wolf@ubuntu-python:~/python$ sudo vi demo.py 
#!/usr/bin/python
#coding:utf-8
import urllib2  #导入urllib2库

response = urllib2.urlopen("http://www.baidu.com")  #获取百度url    
print response.read()   #读取url函数

这就是一个简单的爬虫,保存运行一下看看都爬取了什么。

 1 wolf@ubuntu-python:~/python$ python demo.py 
 2 <html>
 3 <head>
 4 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
 5 <META HTTP-EQUIV="Pragma" CONTENT="no-cache">
 6 <META HTTP-EQUIV="Cache-Control" CONTENT="no-cache">
 7 <META HTTP-EQUIV="Expires" CONTENT="0">
 8 <title>百度一下,你就知道</title>
 9 </head>
10 <body>
11 <script>
12 (function(d){
13 function isCkie(){
14     var isSupport=false;
15     if(typeof(navigator.cookieEnabled)!='undefined'){
16         isSupport=navigator.cookieEnabled;
17         }
18         return isSupport;
19 }
20 
21 //Cookie相关函数
22 var sCkie=new function(){
23 //过期时间
24 this.expTime=function(millisecond){if(millisecond.length==0){millisecond=0};var exp=new Date();exp.setTime(exp.getTime()+parseInt(millisecond));return exp.toGMTString();};
25 //创建cookie
26 this.add=function(name,value,expires,path,domain,secure){d.cookie=name+"="+encodeURI(value)+(expires?(';expires='+expires):'')+(path?(';path='+path):'')+(domain?(';domain='+domain):'')+((secure)?';secure':'');};
27 //删除cookie
28 //this.del=function(name,path,domain){if(getCookie(name)){document.cookie=name+"="+((path)?(";path="+path):'')+((domain)?(";domain="+domain):'')+";expires=Mon,01-Jan-2006 00:00:01 GMT";}};
29 //获取cookie
30 this.get=function(name){var arg=name+"=";var alen=arg.length;var theCookie=''+d.cookie;var inCookieSite=theCookie.indexOf(arg);if(inCookieSite==-1||name==""){return '';}var begin=inCookieSite+alen;var end=theCookie.indexOf(';',begin);if(end==-1){end=theCookie.length;}return decodeURI(theCookie.substring(begin,end));};
31 };
32 var gUrl;
33 if(isCkie()){
34         var ckie=0;
35         var sid='lpvt_f6461a705844d7177814f8a1aa45aaa5';
36         var skie=sCkie.get(sid);
37         if (skie!='')
38         {
39                 ckie=parseInt(skie);
40         }
41 
42         if(ckie<1){
43                 var rand=Math.random();
44                 if(rand<1.8){
45                         gUrl="https://www.baidu.com/index.php?tn=01025065_7_pg";
46                 }else{
47                         gUrl="https://www.baidu.com/";
48                 }
49                 sCkie.add(sid,'1',sCkie.expTime(30*60*1000),0,0,0);
50         }else{
51                 gUrl="https://www.baidu.com/";
52         }
53 }else{
54         gUrl="https://www.baidu.com/";
55 }
56 (function(u){if(window.navigate&&typeof navigate=='function')navigate(u);var ua=navigator.userAgent;if(ua.match(/applewebkit/i)){var h = document.createElement('a');h.rel='noreferrer';h.href=u;document.body.appendChild(h);var evt=document.createEvent('MouseEvents');evt.initEvent('click', true,true);h.dispatchEvent(evt);}else{document.write('<meta http-equiv="Refresh" Content="0; Url='+u+'" >');}})(gUrl);
57 })(document);
58 </script>
59 </body>
60 </html>
61 
62 wolf@ubuntu-python:~/python$ 
demo

相关文章:

  • 2021-12-21
  • 2021-11-10
  • 2021-09-27
  • 2021-12-23
猜你喜欢
  • 2021-10-23
  • 2022-12-23
  • 2022-12-23
  • 2022-12-23
  • 2021-05-26
  • 2021-09-19
  • 2022-12-23
相关资源
相似解决方案