try:
    import urllib2
    from urllib import quote as _quote
except ImportError:
    # Python 3 keeps the same handler classes in urllib.request and
    # quote() in urllib.parse; alias them so the class body is unchanged.
    import urllib.request as urllib2
    from urllib.parse import quote as _quote


class UseProxy(object):
    """Factory for URL openers that send HTTP requests through a proxy.

    The proxy is addressed as ``http://user:password@host:port``.

    NOTE(review): the default credentials and server look like placeholders;
    real values should be passed in by the caller rather than committed here.

    Attributes:
        user: proxy account name.
        password: proxy account password.
        proxyserver: proxy address as ``host:port``.
        content: unused by this class; kept because it was always set.
    """

    def __init__(self, user='aaaa', password='bbbb',
                 proxyserver='xxx.yyy.zzz:8080'):
        """Store the proxy settings.

        Args:
            user: proxy account name.
            password: proxy account password.
            proxyserver: proxy address as ``host:port``.
        """
        self.user = user
        self.password = password
        self.proxyserver = proxyserver
        self.content = ''

    def getproxy(self):
        """Return an opener whose ``http`` requests go through the proxy.

        Returns:
            The object produced by ``build_opener``; call ``.open(url)`` on it.
        """
        # Percent-encode the credentials: a raw '@', ':' or '/' inside them
        # would otherwise be read as URL structure. ProxyHandler unquotes
        # them again before building the Proxy-authorization header.
        proxy = 'http://{}:{}@{}'.format(
            _quote(self.user, safe=''),
            _quote(self.password, safe=''),
            self.proxyserver)
        proxy_handler = urllib2.ProxyHandler({'http': proxy})
        return urllib2.build_opener(proxy_handler, urllib2.HTTPHandler)
UseProxy
from urlparse import urljoin import re from UseProxy import * from bs4 import BeautifulSoup class GetZealerVideo(object): def __init__(self): self.url = \'http://www.zealer.com\' self.content = \'\' self.lists = [] def splitcontent(self, proxyset): # self.proxyset = UseProxy() self.content = proxyset.getproxy().open(self.url).read().decode(\'utf-8\') # self.useproxy() soup = BeautifulSoup(self.content, "html.parser") founddiv = soup.findAll(\'div\', {\'class\': \'subject\'}) foundli = soup.findAll(\'div\', {\'id\': re.compile("^li_layer")}) l = len(founddiv) self.lists = [] if l == len(foundli): for i in range(l):
b = re.findall(\'/post/\d+\'), str(foundli[i]))[1] self.lists.append(urljoin(self.url, b)) self.lists.append(founddiv[i].contents[0].encode(\'utf-8\')) return self.lists if __name__ == \'__main__\': gvideo = GetZealerVideo() proxyset = UseProxy() print \'.\'.join(gvideo.splitcontent(proxyset)).decode(\'utf-8\')
GetZealerVideo
from UseProxy import * from bs4 import BeautifulSoup class GetMydrivers(object): def __init__(self): self.url = \'http://www.mydrivers.com\' self.content = \'\' self.lists = [] def splitcontent(self, proxyset): # self.useproxy() self.content = proxyset.getproxy().open(self.url).read() soup = BeautifulSoup(self.content, "html.parser", from_encoding="gb18030") print soup.original_encoding founddiv = soup.findAll(\'span\', {\'class\': \'titl\'}) for i in range(len(founddiv)): self.lists.append(founddiv[i].contents[0]) return self.lists if __name__ == \'__main__\': gnews = GetMydrivers() proxyset = UseProxy() lists = gnews.splitcontent(proxyset) for l in lists: print str(l).decode(\'utf-8\').encode(\'gb18030\')
GetMydrivers
# -*- coding: utf-8 -*- from Tkinter import * from time import ctime import os import re import GetZealerVideo as soup import GetMydrivers as mnews from UseProxy import * class GetResource(object): def __init__(self): self.win = Tk() self.l1 = StringVar(self.win) self.msg = "" self.frame = Frame(width=800, height=600, bg=\'white\') # self.frame.grid_propagate(False) # self.frame.grid() self.frame.propagate(False) self.frame.pack() self.scroll = Scrollbar(self.frame) self.scroll.pack(side=RIGHT, fill=Y) # self.scroll.grid(row=0, column=1) self.listbox = Listbox(self.frame, selectbackground=\'blue\', font=\'12\', heigh=550, width=750, yscrollcommand=self.scroll.set, xscrollcommand=self.scroll.set) self.listbox.pack(side=TOP, fill=BOTH) # self.listbox.grid(row=0, column=0) self.listbox.bind(\'<Double-1>\', self.get_select) self.frame2 = Frame(width=800, height=50, bg=\'white\') self.frame2.propagate(False) self.frame2.pack() # self.frame2.grid_propagate(False) # self.frame2.grid() Button(self.frame2, text=u\'Get Zealer\', command=self.zealer_video).pack(expand=YES) # Button(self.frame2, text=u\'Get Zealer\', command=self.zealer_video).grid(row=0, column=0) Button(self.frame2, text=u\'Get Mydrivers\', command=self.my_drivers).pack(expand=YES) # Button(self.win, text=u\'Get Mydrivers\', command=self.my_drivers).grid(row=1, column=1) def my_drivers(self): print \'start get at:\', ctime() self.listbox.delete(0, END) self.getm = mnews.GetMydrivers() proxyset = UseProxy() for l in self.getm.splitcontent(proxyset): s = str(l).decode(\'utf-8\') try: self.listbox.insert(END, re.findall(r\'(?<=href=").+?(?=">)\', s)[0]+"\r\n") self.listbox.insert(END, re.findall(r\'(?<=>).+?(?=<)\', s)[0]+"\r\n") self.listbox.update() except IndexError: pass print \'get done at:\', ctime() def zealer_video(self): print \'start get at:\', ctime() self.listbox.delete(0, END) self.getz = soup.GetZealerVideo() proxyset = UseProxy() for l in self.getz.splitcontent(proxyset): self.listbox.insert(END, 
l+"\r\n") self.listbox.update() print \'get done at:\', ctime() def get_select(self, ev=None): self.listbox.config(selectbackground=\'red\') print self.listbox.curselection() self.check = self.listbox.get(self.listbox.curselection()) if self.check: if re.match(\'http\', self.check): os.startfile(self.check) def main(): d = GetResource() mainloop() if __name__ == \'__main__\': main()