Internet - 爱码网

0x01 URL的解析/反解析/连接

解析

urlparse()--分解URL

# -*- coding: UTF-8 -*-
"""Split a URL with urlparse() and print every part of the result."""
from __future__ import print_function

from urlparse import urlparse

sample = 'http://user:pwd@NetLoc:80/p1;param/p2?query=arg#frag'
result = urlparse(sample)
print(result)

# The six tuple fields first, then username and password.
for attr in ('scheme', 'netloc', 'path', 'params', 'query', 'fragment',
             'username', 'password'):
    print(getattr(result, attr))

print(result.hostname, '(netloc in lowercase)')
print(result.port)

View Code

urlsplit()--替换urlparse()，但不会分解参数。(没有params属性)

# -*- coding: UTF-8 -*-
"""Split a URL with urlsplit(), which does not separate path parameters."""
from __future__ import print_function

from urlparse import urlsplit

url = 'http://user:pwd@NetLoc:80/p1;param/p2?query=arg#frag'
parsed = urlsplit(url)
print(parsed)

print(parsed.scheme)
print(parsed.netloc)
# ';param' stays inside the path: the result of urlsplit() has no `params`
# attribute, so reading parsed.params would raise AttributeError.
print(parsed.path)
print(parsed.query)
print(parsed.fragment)
print(parsed.username)
print(parsed.password)
print(parsed.hostname, '(netloc in lowercase)')
print(parsed.port)

urlsplit()示例

urldefrag()--从URL中剥离出片段标识符

# -*- coding: UTF-8 -*-
"""Strip the fragment identifier from a URL with urldefrag()."""
from __future__ import print_function

from urlparse import urldefrag

original = 'http://NetLoc/path;param?query=arg#frag'
print('original :', original)

# urldefrag() returns the URL without its fragment, plus the fragment.
stripped, fragment = urldefrag(original)
print('url :', stripped)
print('fragment :', fragment)

urldefrag()示例

反解析

geturl()--只适用于urlparse()或urlsplit()返回的对象

# -*- coding: UTF-8 -*-
"""Rebuild the URL string from a parse result with geturl()."""
from __future__ import print_function

from urlparse import urlparse

source_url = 'http://NetLoc/path;param?query=arg#frag'
print('original :', source_url)

rebuilt = urlparse(source_url).geturl()
print('after geturl() :', rebuilt)

geturl()示例

urlunparse()--将包含串的普通元组拼接成一个URL（如果输入URL包含多余部分，重新构造的URL可能会将其去除）

# -*- coding: UTF-8 -*-
"""Reassemble a URL from a plain tuple of parts with urlunparse()."""
from __future__ import print_function

from urlparse import urlparse, urlunparse

source_url = 'http://NetLoc/path;param?query=arg#frag'
print('ORIGINAL URL:', source_url)

result = urlparse(source_url)
print('PARSED :', type(result), result)

# Copy the parse result into an ordinary tuple before rebuilding the URL.
plain = tuple(result)
print('TUPLE :', type(plain), plain)
print('NEW :', urlunparse(plain))

urlunparse()示例

连接

urljoin()--由相对片段构造绝对URL

# -*- coding: UTF-8 -*-
"""Resolve relative references against a base URL with urljoin()."""
from __future__ import print_function

from urlparse import urljoin

base = 'http://www.example.com/path/file.html'
for reference in ('anotherfile.html', '../anotherfile.html'):
    print(urljoin(base, reference))

urljoin()相对路径示例

# -*- coding: UTF-8 -*-
"""Join a base URL with a path that does, and does not, start with '/'."""
from __future__ import print_function

from urlparse import urljoin

base = 'http://www.example.com/path/'
for reference in ('/subpath/file.html', 'subpath/file.html'):
    print(urljoin(base, reference))

urljoin()非相对路径示例

注:如果连接到URL的路径以斜线开头(/)，这会将URL的路径重置为顶级路径。如果不是以一个斜线开头，则追加到当前URL路径的末尾。

0x02 BaseHTTPServer--实现web服务器的基类

HTTP GET

下面一个示例展示了一个请求处理器如何向客户返回一个响应

 1 # -*- coding: UTF-8 -*-
 2 
 3 from BaseHTTPServer import BaseHTTPRequestHandler
 4 import urlparse
 5 
 6 class GetHandler (BaseHTTPRequestHandler):
 7     def do_GET(self):
 8         parsed_path = urlparse.urlparse(self.path)
 9         message_parts = [
10                 \'CLIENT VALUES:\',
11                 \'client_address=%s (%s)\' % (self.client_address,
12                                             self.address_string()),
13                 \'command=%s\' % self.command,
14                 \'path=%s\' % self.path,
15                 \'real_path=%s\' % parsed_path.path,
16                 \'query=%s\' % self.request_version,
17                 \'\',
18                 \'SERVER VALUES:\',
19                 \'server_version=%s\' % self.server_version,
20                 \'sys_version=%s\' % self.sys_version,
21                 \'protocol_version=%s\' % self.protocol_version,
22                 \'\',
23                 \'HEADERS RECEIVED:\',
24             ]
25         for name,value in sorted(self.headers.items()):
26             message_parts.append(\'%s=%s\' % (name,value.rstrip()))
27         message_parts.append(\'\')
28         message = \'\r\n\'.join(message_parts)
29         self.send_response(200)
30         self.end_headers()
31         self.wfile.write(message)
32         return
33         
34 if __name__ == \'__main__\':
35     from BaseHTTPServer import HTTPServer
36     server = HTTPServer((\'localhost\',8080),GetHandler)
37     print \'Starting server,use <Ctrl+C> to stop\'
38     server.serve_forever()

View Code

HTTP POST

支持POST请求需要多做一些工作，因为基类不会自动解析表单数据。cgi模块提供了FieldStorage类，如果给定了正确的输入，它知道如何解析表单。

 1 # -*- coding: UTF-8 -*-
 2 
 3 from BaseHTTPServer import BaseHTTPRequestHandler
 4 import cgi
 5 
 6 
 7 class PostHandler(BaseHTTPRequestHandler):
 8     def do_POST(self):
 9         # parse the form data posted
10         form = cgi.FieldStorage(
11             fp=self.rfile,
12             headers=self.headers,
13             environ={\'REQUEST_METHOD\': \'POST\',
14                      \'CONTENT_TYPE\': self.headers[\'Content-Type\'],
15                      })
16 
17         # begin the response
18         self.send_response(200)
19         self.end_headers()
20         self.wfile.write(\'Client:%s\n\' % str(self.client_address))
21         self.wfile.write(\'User-agent:%s\n\' % str(self.headers[\'user-agent\']))
22         self.wfile.write(\'Path:%s\n\' % self.path)
23         self.wfile.write(\'Form data:\n\')
24 
25         # Echo back information about what was posted in the form
26         for field in form.keys():
27             field_item = form[field]
28             if field_item.filename:
29                 # the field contains an uploaded file
30                 file_data = field_item.file.read()
31                 file_len = len(file_data)
32                 del file_data
33                 self.wfile.write(
34                     \'\tUpload %s as "%s" (%d bytes)\n\' % (field, field_item.filename, file_len))
35             else:
36                 # regular form values
37                 self.wfile.write(\'\t%s=%s\n\' % (field, form[field].value))
38 
39         return
40 
41 
42 if __name__ == \'__main__\':
43     from BaseHTTPServer import HTTPServer
44     server = HTTPServer((\'localhost\', 8080), PostHandler)
45     print \'starting sever,use <Ctrl+Z> to stop\'
46     server.serve_forever()

View Code

线程与进程

HTTPServer是SocketServer.TCPServer的一个子类，不使用多线程或者多进程来处理请求。要增加线程或进程，需要使用相应的mix-in技术从SocketServer创建一个新类。

 1 # -*- coding: UTF-8 -*-
 2 
 3 from BaseHTTPServer import HTTPServer, BaseHTTPRequestHandler
 4 from SocketServer import ThreadingMixIn
 5 import threading
 6 
 7 class Handler(BaseHTTPRequestHandler):
 8     def do_GET(self):
 9         self.send_response(200)
10         self.end_headers()
11         message=threading.currentThread().getName()
12         self.wfile.write(message)
13         self.wfile.write(\'\n\')
14         return
15 
16 class ThreadedHTTPServer(ThreadingMixIn,HTTPServer):
17     """Handler requests in a separate thread."""
18     
19 if __name__ == \'__main__\':
20     server = ThreadedHTTPServer((\'localhost\',8080),Handler)
21     print \'Starting server, use <Ctrl+C> to stop\'
22     server.serve_forever()

View Code

处理错误

 1 # -*- coding: UTF-8 -*-
 2 
 3 from BaseHTTPServer import BaseHTTPRequestHandler
 4 
 5 class ErrorHandler(BaseHTTPRequestHandler):
 6     def do_GET(self):
 7         self.send_error(404)
 8         return
 9 
10 if __name__ == \'__main__\'
11     from BaseHTTPServer import HTTPServer
12     server = HTTPServer((\'localhost\',8080),ErrorHandler)
13     print \'Starting server, use <Ctrl+C> to stop\'
14     server.serve_forever()

View Code

设置首部

send_header()方法将向HTTP响应添加首部数据。

 1 # -*- coding: UTF-8 -*-
 2 
 3 from BaseHTTPServer import BaseHTTPRequestHandler
 4 import urlparse
 5 import time
 6 
 7 class GetHandler(BaseHTTPRequestHandler):
 8     def do_GET(self):
 9         self.send_response(200)
10         self.send_header(\'Last-Modified\',
11                           self.date_time_string(time.time()))
12         self.end_headers()
13         self.wfile.write(\'Response body \n\')
14         return
15 
16 if __name__ == \'__main__\':
17     from BaseHTTPServer import HTTPServer
18     server = HTTPServer((\'localhost\',8080),GetHandler)
19     print \'Starting server, use <Ctrl+C> to stop\'
20     server.serve_forever()

send_header()

0x03 urllib--网络资源访问

作用：访问不需要验证的远程资源/cookie等等。

利用缓存实现简单获取

urllib提供的urlretrieve()函数提供下载数据的功能。参数:1.URL 2.存放数据的一个临时文件 3.一个报告下载进度的函数。另外如果URL指示一个表单，要求提交数据，那么urlretrieve()还有一个参数表示要传递的数据。调用程序可以直接删除这个文件，或者将这个文件作为一个缓存，使用urlcleanup()将其删除。

使用一个HTTP GET请求从一个web服务器获取数据的例子:

 1 # -*- coding: UTF-8 -*-
 2 
 3 import urllib
 4 import os
 5 
 6 def reporthook(blocks_read, block_size, total_size):
 7     """total_size is reported in bytes,
 8     block_size is the amount read each time.
 9     blocks_read is the number of blocks successfully read.
10     """
11     if not blocks_read:
12         print \'Connection opened\'
13         return
14     if total_size < 0:
15         #unknown size
16         print \'Read %d blocks (%d bytes)\' % (blocks_read,blocks_read* block_size)
17     else:
18         amount_read = blocks_read * block_size
19         print \'Read %d blocks, or %d/%d\' % (blocks_read,amount_read,total_size)
20     return
21 
22 try:
23     filename,msg = urllib.urlretrieve(\'http://blog.doughellmann.com/\', reporthook=reporthook)
24     
25     print
26     print \'File:\',filename
27     print \'Headers:\'
28     print msg
29     print \'File exists before cleanup:\', os.path.exists(filename)
30     
31 finally:
32     urllib.urlcleanup()
33     print \'File still exists:\', os.path.exists(filename)

View Code

参数编码

对参数编码并追加到URL，从而将它们传递到服务器。(error)

 1 # -*- coding: UTF-8 -*-
 2 
 3 import urllib
 4 
 5 query_args = {\'q\':\'query string\',\'foo\':\'bar\'}
 6 encoded_args = urllib.urlencode(query_args)
 7 print \'Encoded:\', encoded_args
 8 
 9 url = \'http://localhost:8080/?\' + encoded_args
10 print urllib.urlopen(url).read()

View Code

要使用变量的不同出现向查询串传入一个值序列，需要在调用urlencode()时将doseq设置为True。

# -*- coding: UTF-8 -*-
"""Compare urlencode() output with and without doseq for a list value."""
from __future__ import print_function

import urllib

params = {'foo': ['foo1', 'foo2']}
for label, as_sequence in (('Single :', False), ('Sequence:', True)):
    print(label, urllib.urlencode(params, doseq=as_sequence))

View Code

结果是一个查询串，同一个名称与多个值关联。

查询参数中可能有一些特殊字符，在服务器端对URL解析时这些字符会带来问题，所以在传递到urlencode()时要对这些特殊字符"加引号"。要在本地对特殊字符加引号从而得到字符串的“安全”版本。

直接使用quote()或quote_plus()函数。

# -*- coding: UTF-8 -*-
"""Show how urlencode(), quote() and quote_plus() escape the same URL."""
from __future__ import print_function

import urllib

target = 'http://localhost:8080/~dhellmann/'
print('urlencode() :', urllib.urlencode({'url': target}))
print('quote()     :', urllib.quote(target))
print('quote_plus():', urllib.quote_plus(target))

View Code

加引号的逆过程

相应的使用unquote()或unquote_plus()函数。

# -*- coding: UTF-8 -*-
"""Reverse URL quoting with unquote() and unquote_plus()."""
from __future__ import print_function

import urllib

quoted = 'http%3A//localhost%3A8080/%7Edhellmann/'
print(urllib.unquote(quoted))

plus_quoted = 'http%3A%2F%2Flocalhost%3A8080%2F%7Edhellmann%2F'
print(urllib.unquote_plus(plus_quoted))

View Code

路径与URL

有些操作系统在本地文件和URL中使用不同的值分隔路径的不同部分。为了保证代码可移植，可以使用函数pathname2url()和url2pathname()来回转换。

 1 # -*- coding: UTF-8 -*-
 2 
 3 import os
 4 from urllib import pathname2url,url2pathname
 5 
 6 print \'== Default ==\'
 7 path = \'/a/b/c\'
 8 print \'Original:\',path
 9 print \'URL:\',pathname2url(path)
10 print \'Path:\',url2pathname(\'/d/e/f\')
11 
12 
13 print \'== Windows,without drive letter ==\'
14 path = r\'\a\b\c\'
15 print \'Original:\',path
16 print \'URL:\',pathname2url(path)
17 print \'Path:\',url2pathname(\'/d/e/f\')
18 print 
19 
20 print \'== Windows, with drive letter ==\'
21 path = r\'C:\a\b\c\'
22 print \'Original:\',path
23 print \'URL:\',pathname2url(path)
24 print \'Path:\',url2pathname(\'/d/e/f\')
25 print

View Code

0x04 urllib2--网络资源访问

作用：用于打开扩展URL的库，这些URL可以通过定义定制协议处理器来扩展。

urllib2模块提供了一个更新的API来使用URL标识的Internet资源。

HTTP GET

...临场error

0x05 Base64--用ASCII编码二进制数据

base64编码

 1 # -*- coding: UTF-8 -*-
 2 
 3 import base64
 4 import textwrap
 5 
 6 #load this sourse file and strip the header.
 7 with open(__file__,\'rt\') as input:
 8     raw = input.read()
 9     initial_data = raw.split(\'#end_pymotw_headers\')[1]
10     
11 encoded_data = base64.b64encode(initial_data)
12 
13 num_initial = len(initial_data)
14 
15 #there will never be more than 2 padding bytes.
16 padding = 3 - (num_initial %3)
17 
18 print \'%d bytes before encoding\' % num_initial
19 print \'Expect %d padding bytes\' % padding
20 print \'%d bytes after encoding\' % len(encoded_data)
21 print 
22 print encoded_data

View Code

base64解码

 1 # -*- coding: UTF-8 -*-
 2 
 3 import base64
 4 
 5 original_string = \'this is the data, in the clear.\'
 6 print \'Original:\' , original_string
 7 encoded_string = base64.b64encode(original_string)
 8 
 9 print \'Encoded:\',encoded_string
10 
11 decoded_string = base64.b64decode(encoded_string)
12 print \'Decoded:\',decoded_string

View Code

URL安全的变种

因为默认的Base64字母表可能使用+和/，这两个字符在URL中会用到，所以通常很必要使用一个候选编码来替换这些字符。+替换成-，/替换成下划线_

 1 # -*- coding: UTF-8 -*-
 2 
 3 import base64
 4 
 5 encodes_with_pluses = chr(251) + chr(239)
 6 encodes_with_slashes = chr(255) * 2
 7 
 8 for original in [encodes_with_pluses,encodes_with_slashes]:
 9     print \'Original          :\',repr(original)
10     print \'Standard encodingL:\',base64.standard_b64encode(original)
11     print \'UTL-safe encoding :\',base64.urlsafe_b64encode(original)
12     print

View Code

其他编码

 1 # -*- coding: UTF-8 -*-
 2 
 3 import base64
 4 
 5 original_string = \'This is the data,in the clear.\'
 6 print \'Original:\', original_string
 7 
 8 #Base32字母表包括ASCLL集中的26个大写字母以及数字2~7
 9 encoded_string = base64.b32encode(original_string)
10 print \'Base32Encoded :\', encoded_string
11 decoded_string = base64.b32decode(encoded_string)
12 print \'Base32Decoded :\', decoded_string
13 
14 #Base16函数处理十六进制字母表
15 encoded_string = base64.b16encode(original_string)
16 print \'Base16Encoded :\',encoded_string
17 encoded_string = base64.b16decode(encoded_string)
18 print \'Base16Decoded :\',encoded_string

View Code

0x06 robotparser--网络蜘蛛访问控制

作用：解析用于控制网络蜘蛛的robots.txt文件

 1 # -*- coding: UTF-8 -*-
 2 
 3 import robotparser
 4 import urlparse
 5 
 6 AFENT_NAME = \'PyMOTW\'
 7 URL_BASE = \'http://www.doughellmann.com/\'
 8 parser = robotparser.RobotFileParser()
 9 parser.set_url(urlparse.urljoin(URL_BASE,\'robots.txt\'))
10 parser.read()
11 
12 PATHS = [
13     \'/\',
14     \'/PyMOTW/\',
15     \'/admin/\',
16     \'/downloads/PyMOTW-1.92.tar.gz\',
17     ]
18     
19 for path in PATHS:
20     print \'%6s : %s\' % (parser.can_fetch(AFENT_NAME,path),path)
21     url = urlparse.urljoin(URL_BASE,path)
22     print \'%6s : %s\' % (parser.can_fetch(AFENT_NAME,url),url)
23     print

View Code

can_fetch()的URL参数可以是一个相对于网站根目录的相对路径，也可以是一个完全URL。

长久蜘蛛

如果一个应用需要花很长时间来处理它下载的资源，或者受到抑制，需要在很多次下载之间暂停，这样的应用应当以其下载内容的寿命为根据，定期检查新的robots.txt文件。这个寿命并不是自动管理的，不过模块提供了一些简便方法，利用这些方法可以更容易地跟踪文件的寿命。

 1 # -*- coding: UTF-8 -*-
 2 
 3 import robotparser
 4 import urlparse
 5 import time
 6 
 7 AGENT_NAME = \'PyMOTW\'
 8 URL_BASE = \'http://www.doughellmann.com/\'
 9 parser = robotparser.RobotFileParser()
10 parser.set_url(urlparse.urljoin(URL_BASE,\'robots.txt\'))
11 parser.read()
12 parser.modified()
13 
14 PATHS = [
15     \'/\',
16     \'/PyMOTW/\',
17     \'/admin/\',
18     \'/downloads/PyMOTW-1.92.tar.gz\',
19     ]
20 
21 for path in PATHS:
22     age = int(time.time() - parser.mtime())
23     print \'age:\',age,
24     if age>1:
25         print \'rereading robots.txt\'
26         parser.read()
27         parser.modified()
28     else:
29         print
30     print \'%6s : %s\' % (parser.can_fetch(AGENT_NAME,path),path)
31     #Simulate delay in processing
32     time.sleep(1)
33     print

View Code

如果已下载的文件寿命超过了1秒，这个极端例子就会下载一个新的robots.txt文件。作为一个更好的长久应用，在下载整个文件之前可能会请求文件的修改时间。

0x07 Cookie--HTTP Cookie

创建和设置Cookie

# -*- coding: UTF-8 -*-
"""Create a cookie with SimpleCookie and print it."""
from __future__ import print_function

import Cookie

jar = Cookie.SimpleCookie()
jar['name'] = 'p0pl4r'
print(jar)

View Code

输出是一个合法的Set-Cookie首部，可以作为HTTP响应的一部分传递到客户。

Morsel

cookie的所有RFC属性都可以通过表示cookie值的Morsel对象来管理，如到期时间/路径/域。

 1 # -*- coding: UTF-8 -*-
 2 
 3 import Cookie
 4 import datetime
 5 
 6 def show_cookie(c):
 7     print c
 8     for key,morsel in c.iteritems():
 9         print
10         print \'key=\',morsel.key
11         print \'value=\',morsel.value
12         print \'coded_value=\',morsel.coded_value
13         for name in morsel.keys():
14             if morsel[name]:
15                 print \'%s = %s\' % (name,morsel[name])
16                 
17 c = Cookie.SimpleCookie()
18 
19 #A cookie with a value that has to be encoded to fit into the headers
20 c[\'encoded_value_cookie\'] = \'"cookie_value"\'
21 c[\'encoded_value_cookie\'][\'comment\'] = \'this is cookie\\'s comment\'
22 
23 #A cookie that only applies to part of a site
24 c[\'restricted_cookie\'] = \'cookie_value\'
25 c[\'restricted_cookie\'][\'path\'] = \'/sub/path\'
26 c[\'restricted_cookie\'][\'domain\'] = \'PyMOTW\'
27 c[\'restricted_cookie\'][\'secure\'] = \'True\'
28 
29 #A cookie that expires in 5 minutes
30 c[\'with_max_age\'] = \'expires in 5 minutes\'
31 c[\'with_max_age\'][\'max-age\'] = 300 # seconds
32 
33 #A cookie that expires at a specific time
34 c[\'expires_at_time\'] = \'cookie_value\'
35 time_to_live = datetime.timedelta(hours = 1)
36 expires = datetime.datetime(2018,9,19,18,30,14)+time_to_live
37 
38 #Date format:Wdy,DD-Mon-YY HH:MM:SS: GMT
39 expires_at_time = expires.strftime(\'%a,%d %b %Y %H:%M:%S\')
40 c[\'expires_at_time\'][\'expires\'] = expires_at_time
41 show_cookie(c)

View Code