在 Python 套接字中缓存 HTTP GET REQUEST答案

【问题标题】：Cache a HTTP GET REQUEST in Python Sockets（在 Python 套接字中缓存 HTTP GET 请求）
【发布时间】：2015-02-05 04:29:31
【问题描述】：

我正在使用套接字制作代理服务器。当请求的文件不在我的当前目录（缓存）中时，我会向源服务器（即 www）发出 http get 请求并将其缓存以备后用。

我的代码的问题是，每次我从 www 获取资源时，我都会缓存它，但文件的内容总是“永久移动”。

所以会发生这种情况：用户通过在浏览器中输入“localhost:8080/stackoverflow.com”来请求“stackoverflow.com”。浏览器将正确返回页面。当用户在浏览器中第二次输入“localhost:8080/stackoverflow.com”时，浏览器会返回一个页面说stackoverflow.com已经永久移动了。

这里是http get请求和缓存方法的代码：

    @staticmethod
    def find_on_www(conn, requested_file):
        # Excerpt of the proxy's cache-miss path: fetch requested_file from the
        # origin server, relay the answer to the client socket `conn` and write
        # the same data to a cache file named after the request.
        try:
            # Create a socket on the proxy server
            print 'Creating socket on proxy server'
            c = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

            # Only the first "www." is removed; the result is used as the
            # host name to connect to.
            host_name = requested_file.replace("www.","",1)
            print 'Host Name: ', host_name

            # Connect to port 80 of the origin host
            c.connect((host_name, 80))
            print 'Socket connected to port 80 of the host'

            # Wrap the socket in an unbuffered file-like object and send an
            # HTTP/1.0 GET with an absolute URI. The request is terminated
            # with bare "\n\n" rather than CRLF.
            file_object = c.makefile('r', 0)
            file_object.write("GET " + "http://" + requested_file + " HTTP/1.0\n\n")

            # Read the response into buffer. readlines() reads until the
            # origin closes the connection, so buff holds everything the
            # origin sent: status line, headers and body.
            buff = file_object.readlines()

            # Create a new file in the cache for the requested file.
            # Also send the response in the buffer to client socket
            # and the corresponding file in the cache
            # NOTE(review): the cache file therefore receives the origin's
            # status line and headers as well as the body, and neither
            # temp_file nor the upstream socket c is closed in this excerpt.
            temp_file = open("./" + requested_file, "wb")
            for i in range(0, len(buff)):
                temp_file.write(buff[i])
                conn.send(buff[i])

            # Close the client connection once everything has been relayed
            conn.close()

如果有人感兴趣，这是我的其余代码：

import socket       # Socket programming
import signal       # To shut down server on ctrl+c
import time         # Current time
import os           # To get the last-modified
import mimetypes    # To guess the type of requested file
import sys          # To exit the program
from threading import Thread


def generate_header_lines(code, modified, length, mimetype):
    """Build the header section of an HTTP/1.1 response.

    Args:
        code: HTTP status code. Only 200 and 404 produce a status line,
            a Date header and a Server header; for any other value just
            the common entity headers are emitted.
        modified: Text placed in the Last-Modified header (200 only).
        length: Size of the response body in bytes (Content-Length).
        mimetype: Value of the Content-Type header.

    Returns:
        The header block as a string. Every line ends with CRLF and the
        block is terminated by an empty line, ready to be followed by
        the body.
    """
    # HTTP dates are always expressed in GMT, never in local time.
    date_line = 'Date: ' + time.strftime("%a, %d %b %Y %H:%M:%S GMT",
                                         time.gmtime())

    if code == 200:
        lines = [
            'HTTP/1.1 200 OK',
            date_line,
            'Server: Proxy-Server-Thomas',
            'Last-Modified: ' + modified,
        ]
    elif code == 404:
        lines = [
            'HTTP/1.1 404 Not Found',
            date_line,
            'Server: Web-Server-Thomas',
        ]
    else:
        lines = []

    lines.append('Content-Length: ' + str(length))
    lines.append('Content-Type: ' + mimetype)
    # Let the client know the connection is closed after this response.
    lines.append('Connection: close')

    # CRLF is the line terminator required by HTTP; the extra CRLF is the
    # empty line that separates the headers from the body.
    return '\r\n'.join(lines) + '\r\n\r\n'


def get_mime_type(requested_file):
    # Get the file's mimetype and encoding
    try:
        (mimetype, encoding) = mimetypes.guess_type(requested_file, True)
        if not mimetype:
            print "Mimetype found: text/html"
            return 'text/html'
        else:
            print "Mimetype found: ", mimetype
            return mimetype

    except TypeError:
        print "Mimetype found: text/html"
        return 'text/html'


class WebServer:
    def __init__(self):
        """
        Constructor
        :return:
        """
        self.host = ''      # Host for the server
        self.port = 8000    # Port for the server

        # Create socket
        self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

    def start_server(self):
        """ Starts the server
        :return:
        """
        # Bind the socket to the host and port
        self.socket.bind((self.host, self.port))

        print "Connection started on ", self.port

        # Start the main loop of the server - start handling clients
        self.main_loop()

    @staticmethod
    def shutdown():
        """ Shuts down the server """
        try:
            s.socket.close()
        except Exception as e:
            print "Something went wrong closing the socket: ", e

    def main_loop(self):
        """Main loop of the server"""
        while True:
            # Start listening
            self.socket.listen(1)

            # Wait for a client to connect
            client_socket, client_address = self.socket.accept()

            # Wait for a request from the client
            data = client_socket.recv(1024)

            t = Thread(target=self.handle_request, args=(client_socket, data))
            t.start()

            # # Handle the request from the client
            # self.handle_request(client_socket, data)

    def handle_request(self, conn, data):
        """ Handles a request from the client """
        # Decode the data
        string = bytes.decode(data)

        # Split the request
        requested_file = string.split(' ')
        # Get the method that is requested
        request_method = requested_file[0]

        if request_method == 'GET':
            # Get the part of the request that contains the name
            requested_file = requested_file[1]
            # Get the name of the file from the request
            requested_file = requested_file[1:]

            print "Searching for: ", requested_file

            try:
                # Open the file
                file_handler = open(requested_file, 'rb')
                # Get the content of the file
                response_content = file_handler.read()
                # Close the handler
                file_handler.close()

                # Get information about the file from the OS
                file_info = os.stat(requested_file)
                # Extract the last modified time from the information
                time_modified = time.ctime(file_info[8])
                # Get the time modified in seconds
                modified_seconds = os.path.getctime(requested_file)

                print "Current time: ", time.time()
                print "Modified: ", time_modified

                if (float(time.time()) - float(modified_seconds)) > 120:  # more than 2 minutes
                    print "Time outdated!"
                    #self.find_on_www(conn, requested_file)

                # Get the file's mimetype and encoding
                mimetype = get_mime_type(requested_file)

                print "Mimetype = ", mimetype

                # Create the correct header lines
                response_headers = generate_header_lines(200, time_modified, len(response_content), mimetype)

                # Create the response to the request
                server_response = response_headers.encode() + response_content

                # Send the response back to the client
                conn.send(server_response)

                # Close the connection
                conn.close()

            except IOError:  # Couldn't find the file in the cache - Go find file on www
                print "Error: " + requested_file + " not found in cache!"
                self.find_on_www(conn, requested_file)

    @staticmethod
    def find_on_www(conn, requested_file):
        try:
            # Create a socket on the proxy server
            print 'Creating socket on proxy server'
            c = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

            host_name = requested_file.replace("www.","",1)
            print 'Host Name: ', host_name

            # Connect to the socket to port 80
            c.connect((host_name, 80))
            print 'Socket connected to port 80 of the host'

            # Create a temporary file on this socket and ask port 80
            # for the file requested by the client
            file_object = c.makefile('r', 0)
            file_object.write("GET " + "http://" + requested_file + " HTTP/1.0\n\n")

            # Read the response into buffer
            buff = file_object.readlines()

            # Create a new file in the cache for the requested file.
            # Also send the response in the buffer to client socket
            # and the corresponding file in the cache
            temp_file = open("./" + requested_file, "wb")
            for i in range(0, len(buff)):
                temp_file.write(buff[i])
                conn.send(buff[i])

            conn.close()

        except Exception as e:
            # Generate a body for the file - so we don't have an empty page
            response_content = "<html><body><p>Error 404: File not found</p></body></html>"

            # Generate the correct header lines
            response_headers = generate_header_lines(404, '', len(response_content), 'text/html')

             # Create the response to the request
            server_response = response_headers.encode() + response_content

            # Send the response back to the client
            conn.send(server_response)

            # Close the connection
            conn.close()


def shutdown_server(sig, dummy):
    """Signal handler: stop the global server `s`, then exit with status 1.

    Both arguments are supplied by the signal machinery and are not used.
    """
    # Close the listening socket of the running server
    s.shutdown()

    # Leave the program; same effect as sys.exit(1)
    raise SystemExit(1)

# Create a web server. This happens before the signal handler is installed
# because the handler looks up the global `s`.
s = WebServer()
# Shut down on ctrl+c
signal.signal(signal.SIGINT, shutdown_server)
# Start the server (blocks in the accept loop)
s.start_server()

【问题讨论】：

我在尝试使用 Firefox 33 时无法获得相同的结果。相反，在第二次尝试连接时，系统询问我是否希望下载该页面，因为它认为它是一个 Windows 可执行文件。这是因为您的代码根据扩展名确定的文件类型返回 mimetype，.com 是 Windows 可执行文件。
正如您在我的 get_mime_type 函数中看到的那样，如果 mimetypes 库无法猜测 mime 类型，我将只返回“text/html”。所以 .com 将返回 'text/html' 并且它不应该认为它是一个可执行文件。但也许这不是正确的做法，您有什么建议吗？
我的建议是：在访问实际的 Web 服务器时，读取其响应标头中返回的 mimetype（在 Content-Type: 中给出），将其存储在某处，然后在从缓存返回该页面、重新生成标头时使用它。

标签： python sockets http server

【解决方案1】：

您的代码的问题在于：如果您访问的页面返回状态码 301（页面已永久移动），该状态会包含在响应标头中。当您查看尚未存入缓存的页面时，您会把代理服务器发出 GET 请求后得到的响应原样转发给客户端。这会通知客户端再发出一个 GET 请求，而客户端发出该请求时会绕过您的代理服务器。

您第二次尝试通过代理服务器请求页面时，它会从缓存中检索上一个请求。此文件包含上一个请求的标头，其中正确包含重定向状态代码，但是您随后将自己的状态代码 200 ok 添加到返回的消息中。当客户端首先读取此状态代码时，它并没有意识到您希望它发出另一个请求以查找已重定向的页面。因此，它只显示告诉您页面已移动的页面。

当代理服务器必须查看 Internet 上的实际页面时，您需要做的是解析 Web 服务器返回的标头。然后根据这些服务器将正确的标头返回给客户端。

【讨论】：