提取第三张报告的各种信息。使用正则和xpath方法。
# coding=utf8
"""Extract the fields of a credit-report HTML page into an ordered dict.

Most fields are located with regular expressions run over the decoded page
source; the public-records section is read with XPath.  The module is kept
compatible with both Python 2 and Python 3, so patterns are written as
``u''`` literals with doubled backslashes instead of raw strings.
"""
import json
import os
import re
from collections import OrderedDict

from lxml import etree

from common import html, LoggerUntil, handle_parse_exception
from html_processor import HtmlProcessor

logger = LoggerUntil(name="crcc_paser").getlog(
    logfilename='crcc_paser.log', loglevel=2, add_StreamHandler=1)

# Directory the raw report pages are saved into.
_HTML_DIR = 'htmldir'

# (output key, label that precedes the value inside a <strong> element).
_USER_INFO_FIELDS = (
    ('report_no', u'报告编号'),
    ('query_time', u'查询时间'),
    ('report_time', u'报告时间'),
    ('crcc_name', u'姓名'),
    ('id_type', u'证件类型'),
    ('id_no', u'证件号码'),
)

# (output key, label in the first cell of the summary-table row).
_SUMMARY_ROWS = (
    ('account_num', u'账户数'),
    ('uncleared_num', u'未结清/未销户账户数'),
    ('overdue_num', u'发生过逾期的账户数'),
    ('overdue90_num', u'发生过90天以上逾期的账户数'),
    ('assure_num', u'为他人担保笔数'),
)

# One captured cell of a summary row, and the whole row with a ``%s`` slot
# for the row label.  Each row has exactly three captured cells.
_SUMMARY_CELL = u'\\s*?<td align="center" class="p">\\s*(.*?)\\s*?</td>'
_SUMMARY_ROW = (
    u'<tr>\\s*?<td align="left" class="p">\\s*? %s\\s*?</td>'
    + _SUMMARY_CELL * 3
    + u'\\s*?</tr>'
)

# One numbered list item of the loan-details section.
_LOAN_ITEM = (
    u'<li\\s*?style="list-style-type: decimal; list-style-position: outside">'
    u'\\s*?(\\S*?)\\s*?</li>'
)

# One row of the query-records table: number, date (ends with "日"),
# person, reason.
_QUERY_ROW = (
    u'<tr align="center">'
    u'[\\s\\S]*?td class="p">\\s*(.*?)\\s*?</td>'
    u'[\\s\\S]*?<td class="p">\\s*(.*?日)\\s*?</td>'
    u'[\\s\\S]*?<td class="p">\\s*(.*?)\\s*?</td>'
    u'[\\s\\S]*?<td class="p">\\s*(.*?)\\s*?</td>'
    u'[\\s\\S]*?</tr>'
)

# Sentences the page shows when a section has no entries.
_NO_PUBLIC_RECORDS = u'系统中没有您最近5年内的欠税记录、民事判决记录、强制执行记录、行政处罚记录及电信欠费记录。'
_NO_QUERY_RECORDS = u'系统中没有您的信用报告最近2年被查询的记录。'

# XPath of the public-records cell.
# NOTE(review): the page layout for a report that actually has public
# records has not been confirmed, and lxml does not add implicit <tbody>
# elements the way browsers do -- verify this path against a real sample.
_PUBLIC_RECORDS_XPATH = '//table[@align="center"]//table[5]/tbody/tr[3]/td'


class HtmlFileMixin(object):
    """Mixin that saves ``self.html`` to ``htmldir/<self.name>.html``.

    The host class must provide ``html`` (written in binary mode, so it is
    expected to be bytes) and ``name``.
    """

    def save_to_file(self):
        """Write ``self.html`` to the path returned by :meth:`create_file`."""
        with open(self.create_file(), 'wb') as f:
            f.write(self.html)

    def create_file(self):
        """Create the output directory if needed and return the file path."""
        if not os.path.exists(_HTML_DIR):
            os.mkdir(_HTML_DIR)
        return os.path.join(_HTML_DIR, self.name + '.html')


class CrccPaser(HtmlFileMixin):
    """Parser for one credit-report page.

    Results accumulate in ``self.data`` (an ``OrderedDict``) in the order
    the ``extract_*`` methods are called.
    """

    def __init__(self, html, name):
        """
        :param html: UTF-8 encoded page source (bytes).
        :param name: identifier of the report; stored under ``data['name']``
            and used as the file name by :class:`HtmlFileMixin`.
        """
        self.html = html
        self.name = name
        self.data = OrderedDict()
        self.data['name'] = name
        self.selector = None
        self.text = self._get_text()
        self.get_selector()

    def _get_text(self):
        """Return the page source decoded as UTF-8."""
        return self.html.decode('utf8')

    def get_selector(self):
        """Parse the decoded page with lxml and keep the tree for XPath."""
        self.selector = etree.HTML(self.text)

    def extract_user_info(self):
        """Extract the header fields (report number, times, name, ID).

        A field whose label is not found is stored as an empty string
        instead of raising, so one missing field no longer aborts the
        extraction of all the following ones.
        """
        for key, label in _USER_INFO_FIELDS:
            match = re.search(u'%s:(.*?)</strong>' % label, self.text)
            if match is None:
                logger.info('report field not found: %s' % key)
                self.data[key] = u''
            else:
                self.data[key] = match.group(1).strip()

    def extract_summary_information(self):
        """Extract the five rows of the summary table.

        Each row is stored as a dict built by :meth:`_init_num_dict`; a row
        that cannot be found yields three empty strings.
        """
        for key, label in _SUMMARY_ROWS:
            match = re.search(_SUMMARY_ROW % label, self.text)
            if match is None:
                logger.info('summary row not found: %s' % key)
                counts = (u'', u'', u'')
            else:
                counts = match.groups()
            self.data[key] = self._init_num_dict(counts)

    @staticmethod
    def _init_num_dict(num_tuple):
        """Map a 3-tuple of summary cells to named counts.

        :param num_tuple: ``(credit_card, home_loans, other_loans)``.
        :raises ValueError: if *num_tuple* does not hold exactly three items.

        NOTE: the middle key used to be ``' home_loans'`` with a leading
        space (a typo, visible in previously published sample output); it
        is now ``'home_loans'``.  An ``OrderedDict`` keeps the key order
        stable on every Python version.
        """
        credit_card, home_loans, other_loans = num_tuple
        return OrderedDict([
            ('credit_card', credit_card),
            ('home_loans', home_loans),
            ('other_loans', other_loans),
        ])

    def extract_all_loan_information(self):
        """Collect the text of every numbered loan list item."""
        self.data['all_loan_information'] = re.findall(_LOAN_ITEM, self.text)

    def extract_public_records(self):
        """Extract the public-records section as a list of strings.

        When the page states that there are no records, that sentence is
        the single list element.  Otherwise the text of the first cell
        matched by the XPath is returned; the list is empty when the XPath
        matches nothing.  (The original code called ``.strip()`` on the
        lxml element itself, which always raised.)
        """
        if _NO_PUBLIC_RECORDS in self.text:
            public_records = [_NO_PUBLIC_RECORDS]
        else:
            cells = self.selector.xpath(_PUBLIC_RECORDS_XPATH)
            # string(.) concatenates all text nodes below the cell.
            public_records = [
                cell.xpath('string(.)').strip() for cell in cells[:1]
            ]
            if not public_records:
                logger.info('public records cell not found')
        self.data['public_records'] = public_records

    def extract_query_records(self):
        """Extract the query-records table.

        Stores a list of ordered dicts (see :meth:`_init_query_records`),
        or a one-element list with the "no records" sentence.
        """
        if _NO_QUERY_RECORDS in self.text:
            query_records = [_NO_QUERY_RECORDS]
        else:
            rows = re.findall(_QUERY_ROW, self.text)
            query_records = self._init_query_records(rows)
        self.data['query_records'] = query_records

    @staticmethod
    def _init_query_records(query_records):
        """Turn 4-tuples into ordered dicts.

        :type query_records: list
        :param query_records: tuples of ``(no, date, person, reason)``.
        :return: list of ``OrderedDict`` with keys ``no``, ``query_date``,
            ``query_person`` and ``query_reason``.
        """
        query_records_list = []
        for no, query_date, query_person, query_reason in query_records:
            query_records_list.append(OrderedDict([
                ('no', no),
                ('query_date', query_date),
                ('query_person', query_person),
                ('query_reason', query_reason),
            ]))
        return query_records_list

    @handle_parse_exception
    def extract_all(self):
        """Run every extraction step in the order of the output fields."""
        self.extract_user_info()
        self.extract_summary_information()
        self.extract_all_loan_information()
        self.extract_public_records()
        self.extract_query_records()


def extract_crcc(html_str, name):
    """Save the raw page, parse it and return the result as a JSON string.

    :param html_str: UTF-8 encoded page source (bytes).
    :param name: identifier of the report.
    :return: ``CrccPaser.data`` serialised with ``ensure_ascii=False``.
    """
    htmlProcessor = HtmlProcessor(html_str, name)
    htmlProcessor.save_to_file()  # keep a copy of the raw html

    crccPaser = CrccPaser(html_str, name)
    crccPaser.extract_all()

    # Serialise once and reuse the string for both the log and the caller.
    result = json.dumps(crccPaser.data, ensure_ascii=False)
    logger.info(result)
    return result


if __name__ == '__main__':
    extract_crcc(html, '小明5')
其中html是第三张报告的页面源码字符串。
结果是
{"name": "小明5", "report_no": "2017122200004891965680", "query_time": "2017.12.22 11:12:32", "report_time": "2017.12.22 18:38:18", "crcc_name": "小明5", "id_type": "身份证", "id_no": "**************4337", "account_num": {" home_loans": "0", "other_loans": "2", "credit_card": "0"}, "uncleared_num": {" home_loans": "0", "other_loans": "0", "credit_card": "0"}, "overdue_num": {" home_loans": "0", "other_loans": "0", "credit_card": "0"}, "overdue90_num": {" home_loans": "0", "other_loans": "0", "credit_card": "0"}, "assure_num": {" home_loans": "0", "other_loans": "0", "credit_card": "0"}, "all_loan_information": ["2012年8月23日国家开发银行湖北省分行发放的6,000元(人民币)个人助学贷款,2014年10月已结清。", "2011年11月19日国家开发银行湖北省分行发放的6,000元(人民币)个人助学贷款,2014年10月已结清。"], "public_records": ["系统中没有您最近5年内的欠税记录、民事判决记录、强制执行记录、行政处罚记录及电信欠费记录。"], "query_records": [{"no": "1", "query_date": "2017年12月4日", "query_person": "本人", "query_reason": "本人查询(互联网个人信用信息服务平台)"}, {"no": "2", "query_date": "2017年11月20日", "query_person": "本人", "query_reason": "本人查询(互联网个人信用信息服务平台)"}, {"no": "3", "query_date": "2017年11月6日", "query_person": "本人", "query_reason": "本人查询(互联网个人信用信息服务平台)"}, {"no": "4", "query_date": "2017年10月20日", "query_person": "本人", "query_reason": "本人查询(互联网个人信用信息服务平台)"}, {"no": "5", "query_date": "2017年10月10日", "query_person": "本人", "query_reason": "本人查询(互联网个人信用信息服务平台)"}, {"no": "6", "query_date": "2017年9月27日", "query_person": "本人", "query_reason": "本人查询(互联网个人信用信息服务平台)"}, {"no": "7", "query_date": "2017年9月18日", "query_person": "本人", "query_reason": "本人查询(互联网个人信用信息服务平台)"}]}
可以把页面源码发送到后端,用py或java提取;也可以在webview客户端提取,客户端提取的js代码如下。
/**
 * Created by wj49476 on 201/3/20.
 *
 * Client-side extractor meant to be injected into the page that shows the
 * credit report. extractReport() reads document.body.outerHTML, pulls the
 * report fields out with regular expressions (plus one CSS selector) and
 * returns them as a JSON string. The Base64-encoded page source is included
 * under the "htmlStr" key.
 *
 * Everything is ES5 and uses only local helpers: no built-in prototype is
 * modified and no global variable is created besides extractReport and
 * Base64.
 */
function extractReport() {
    // Strip leading and trailing whitespace.
    function trim(text) {
        return text.replace(/(^\s*)|(\s*$)/g, "");
    }

    // Return the trimmed single capture group of regObj, or '' when the
    // pattern does not match (so a missing field never throws).
    function firstGroup(text, regObj) {
        var matchArray = text.match(regObj);
        if (matchArray && matchArray.length == 2) {
            return trim(matchArray[1]);
        }
        return '';
    }

    // Read capture group n from a match result; '' when the match is null
    // or the group is undefined.
    function groupAt(matchArray, n) {
        var value = matchArray ? matchArray[n] : undefined;
        if (value === undefined) {
            console.debug("取下标错误");
            value = '';
        }
        return value;
    }

    // Collect every match of a global regex as full match arrays.
    // A missing section simply yields an empty list.
    function matchAll(text, globalRegObj) {
        var found;
        var results = [];
        globalRegObj.lastIndex = 0;
        while ((found = globalRegObj.exec(text)) !== null) {
            results.push(found);
        }
        return results;
    }

    // innerText of the first element matching the CSS selector, or "".
    function getInnerText(cssselector) {
        var element = document.querySelector(cssselector);
        if (element) {
            return element.innerText;
        }
        console.warn("没有找到 " + cssselector + " 的元素");
        return "";
    }

    var NO_PUBLIC_RECORDS = '系统中没有您最近5年内的欠税记录、民事判决记录、强制执行记录、行政处罚记录及电信欠费记录。';
    var NO_QUERY_RECORDS = '系统中没有您的信用报告最近2年被查询的记录。';

    // One captured cell of a summary-table row.
    var SUMMARY_CELL = '\\s*?<td align="center" class="p">\\s*(.*?)\\s*?</td>';

    // Regex for the summary row whose first cell holds the given label;
    // it captures the three value cells.
    function summaryRowPattern(label) {
        return new RegExp(
            '<tr>\\s*?<td align="left" class="p">\\s*? ' + label + '\\s*?</td>' +
            SUMMARY_CELL + SUMMARY_CELL + SUMMARY_CELL +
            '\\s*?</tr>'
        );
    }

    var data = {};
    data['SummaryInformation'] = {};
    var htmlStr = document.body.outerHTML;

    function extractUserInfo() {
        data["reportNo"] = firstGroup(htmlStr, /报告编号:(.*?)<\/strong>/);
        data["queryTime"] = firstGroup(htmlStr, /查询时间:(.*?)<\/strong>/);
        data['reportTime'] = firstGroup(htmlStr, /报告时间:(.*?)<\/strong>/);
        data['crccName'] = firstGroup(htmlStr, /姓名:(.*?)<\/strong>/);
        data['idType'] = firstGroup(htmlStr, /证件类型:(.*?)<\/strong>/);
        data['idNo'] = firstGroup(htmlStr, /证件号码:(.*?)<\/strong>/);
    }

    // Map the three captured cells of a summary row to named counts.
    // NOTE: the last key used to be misspelled "othreLoans".
    function initNumDict(matchArray) {
        var numDict = {};
        numDict["creditCard"] = groupAt(matchArray, 1);
        numDict["homeLoans"] = groupAt(matchArray, 2);
        numDict["otherLoans"] = groupAt(matchArray, 3);
        return numDict;
    }

    function extractSummaryInformation() {
        var rows = [
            ['accountNum', '账户数'],
            ['unclearedNum', '未结清/未销户账户数'],
            ['overdueNum', '发生过逾期的账户数'],
            ['overdue90Num', '发生过90天以上逾期的账户数'],
            ['assureNum', '为他人担保笔数']
        ];
        for (var i = 0; i < rows.length; i++) {
            var found = htmlStr.match(summaryRowPattern(rows[i][1]));
            data['SummaryInformation'][rows[i][0]] = initNumDict(found);
        }
    }

    function extractAllLoanInformation() {
        var items = matchAll(htmlStr, /<li\s*?style="list-style-type: decimal; list-style-position: outside">\s*?(\S*?)\s*?<\/li>/g);
        var allLoanInformation = [];
        for (var i = 0; i < items.length; i++) {
            allLoanInformation.push(groupAt(items[i], 1));
        }
        data['allLoanInformation'] = allLoanInformation;
    }

    function extractPublicRecords() {
        var publicRecords;
        if (htmlStr.indexOf(NO_PUBLIC_RECORDS) === -1) {
            // No sample with real public records was available, so this
            // selector is unverified.
            publicRecords = getInnerText('body > div > div > table > tbody > tr:nth-child(2) > td > table:nth-child(11)');
        } else {
            publicRecords = NO_PUBLIC_RECORDS;
        }
        data['publicRecords'] = publicRecords;
    }

    // Turn query-table row matches into {no, queryDate, queryPerson,
    // queryReason} objects.
    function initQueryRecords(rowMatches) {
        var queryRecordsArray = [];
        for (var i = 0; i < rowMatches.length; i++) {
            var queryRecordDict = {};
            queryRecordDict['no'] = groupAt(rowMatches[i], 1);
            queryRecordDict['queryDate'] = groupAt(rowMatches[i], 2);
            queryRecordDict['queryPerson'] = groupAt(rowMatches[i], 3);
            queryRecordDict['queryReason'] = groupAt(rowMatches[i], 4);
            queryRecordsArray.push(queryRecordDict);
        }
        return queryRecordsArray;
    }

    function extractQueryRecords() {
        if (htmlStr.indexOf(NO_QUERY_RECORDS) === -1) {
            var rowMatches = matchAll(htmlStr, /<tr align="center">[\s\S]*?td class="p">\s*(.*?)\s*?<\/td>[\s\S]*?<td class="p">\s*(.*?日)\s*?<\/td>[\s\S]*?<td class="p">\s*(.*?)\s*?<\/td>[\s\S]*?<td class="p">\s*(.*?)\s*?<\/td>[\s\S]*?<\/tr>/g);
            data['queryRecords'] = initQueryRecords(rowMatches);
        } else {
            data['queryRecords'] = [NO_QUERY_RECORDS];
        }
    }

    extractUserInfo();
    extractSummaryInformation();
    extractAllLoanInformation();
    extractPublicRecords();
    extractQueryRecords();

    data['htmlStr'] = Base64.encode(htmlStr);
    return JSON.stringify(data);
}

/**
 * Base64 encoder/decoder for JavaScript strings. Text is converted to
 * UTF-8 bytes before encoding and back after decoding.
 */
var Base64 = {
    // private property; index 64 ("=") is the padding character
    _keyStr: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=",

    // public method for encoding
    encode: function(input) {
        var output = "";
        var chr1, chr2, chr3, enc1, enc2, enc3, enc4;
        var i = 0;
        input = Base64._utf8_encode(input);
        while (i < input.length) {
            // Reading past the end gives NaN, which marks missing bytes.
            chr1 = input.charCodeAt(i++);
            chr2 = input.charCodeAt(i++);
            chr3 = input.charCodeAt(i++);
            enc1 = chr1 >> 2;
            enc2 = ((chr1 & 3) << 4) | (chr2 >> 4);
            enc3 = ((chr2 & 15) << 2) | (chr3 >> 6);
            enc4 = chr3 & 63;
            if (isNaN(chr2)) {
                enc3 = enc4 = 64;
            } else if (isNaN(chr3)) {
                enc4 = 64;
            }
            output = output +
                this._keyStr.charAt(enc1) + this._keyStr.charAt(enc2) +
                this._keyStr.charAt(enc3) + this._keyStr.charAt(enc4);
        }
        return output;
    },

    // public method for decoding
    decode: function(input) {
        var output = "";
        var chr1, chr2, chr3;
        var enc1, enc2, enc3, enc4;
        var i = 0;
        // Drop anything that is not part of the Base64 alphabet.
        input = input.replace(/[^A-Za-z0-9\+\/\=]/g, "");
        while (i < input.length) {
            enc1 = this._keyStr.indexOf(input.charAt(i++));
            enc2 = this._keyStr.indexOf(input.charAt(i++));
            enc3 = this._keyStr.indexOf(input.charAt(i++));
            enc4 = this._keyStr.indexOf(input.charAt(i++));
            chr1 = (enc1 << 2) | (enc2 >> 4);
            chr2 = ((enc2 & 15) << 4) | (enc3 >> 2);
            chr3 = ((enc3 & 3) << 6) | enc4;
            output = output + String.fromCharCode(chr1);
            if (enc3 != 64) {
                output = output + String.fromCharCode(chr2);
            }
            if (enc4 != 64) {
                output = output + String.fromCharCode(chr3);
            }
        }
        output = Base64._utf8_decode(output);
        return output;
    },

    // private method for UTF-8 encoding (one char code per output byte)
    _utf8_encode: function(string) {
        string = string.replace(/\r\n/g, "\n");
        var utftext = "";
        for (var n = 0; n < string.length; n++) {
            var c = string.charCodeAt(n);
            if (c < 128) {
                utftext += String.fromCharCode(c);
            } else if ((c > 127) && (c < 2048)) {
                utftext += String.fromCharCode((c >> 6) | 192);
                utftext += String.fromCharCode((c & 63) | 128);
            } else {
                utftext += String.fromCharCode((c >> 12) | 224);
                utftext += String.fromCharCode(((c >> 6) & 63) | 128);
                utftext += String.fromCharCode((c & 63) | 128);
            }
        }
        return utftext;
    },

    // private method for UTF-8 decoding
    _utf8_decode: function(utftext) {
        var string = "";
        var i = 0;
        // Declared locally; the original leaked c1, c2 and c3 as globals.
        var c, c2, c3;
        while (i < utftext.length) {
            c = utftext.charCodeAt(i);
            if (c < 128) {
                string += String.fromCharCode(c);
                i++;
            } else if ((c > 191) && (c < 224)) {
                c2 = utftext.charCodeAt(i + 1);
                string += String.fromCharCode(((c & 31) << 6) | (c2 & 63));
                i += 2;
            } else {
                c2 = utftext.charCodeAt(i + 1);
                c3 = utftext.charCodeAt(i + 2);
                string += String.fromCharCode(((c & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63));
                i += 3;
            }
        }
        return string;
    }
};
这个是js版本。由于央行征信报告页面是拼接的,css取值不能一步到位,还必须再用正则细取;再者页面元素没有很好的标记,所以js版也是以正则为主。js的match对应py的search,js的match加g模式对应py的findall;而js的search返回的是一个数字(首次匹配的位置,未匹配时为-1)。要先弄清楚py和js的正则api的区别。
2、另外,使用js版本的用法是,要配合app的webview。在f12的console栏里面可以调试测试js,但那不是自动化的,尤其在多个页面跳转情况下,使用webview注入js非常方便。也可以直接在app端用httpclient对淘宝网发请求,但是征信类的项目,一般需要先登录,不依赖webview直接使用httpclient请求淘宝登录接口的方式大批量登录任意明文的账号密码,而不是复制ua cookie什么的(复制没什么鸟用,用户根本不知道cookie是什么,更不用说ua参数是什么了),搞定它是天方夜谭,我没见过任何人搞定过,难度指数是五颗星。有兴趣的可以试试,不要只是嘴炮说抓包模拟就完了这么简单。