phantomjs + python 打造一个微信机器人

1.前奏

媳妇公司不能上网，但经常需要在公众号上找一些文章做参考。需要的时候她就把文章链接分享给我，然后我在浏览器里打开网页，一点点复制过来粘贴到 Word 中，遇到图片更悲催，还得手动调整大小。是不是感觉自己的码农职称受到了挑战……某一天看到一篇《基于C#.NET的高端智能化网络爬虫（二）（攻破携程网）》，才发现了 PhantomJS，由此解放了我复制粘贴的双手。

2.介绍

PhantomJS

PhantomJS是一个基于webkit的JavaScript API。它使用QtWebKit作为它核心浏览器的功能，使用webkit来编译解释执行JavaScript代码。任何你可以在基于webkit浏览器做的事情，它都能做到。它不仅是个隐形的浏览器，提供了诸如CSS选择器、支持Web标准、DOM操作、JSON、HTML5、Canvas、SVG等，同时也提供了处理文件I/O的操作，从而使你可以向操作系统读写文件等。PhantomJS的用处可谓非常广泛，诸如网络监测、网页截屏、无需浏览器的 Web 测试、页面访问自动化等。

python

python当歌,人生几何。。。

itchat

itchat是一个开源的微信个人号接口，使用python调用微信从未如此简单。使用不到三十行的代码，你就可以完成一个能够处理所有信息的微信机器人。当然，该api的使用远不止一个机器人，更多的功能等着你来发现，比如这些。该接口与公众号接口itchatmp共享类似的操作方式，学习一次掌握两个工具。如今微信已经成为了个人社交的很大一部分，希望这个项目能够帮助你扩展你的个人的微信号、方便自己的生活。摘自官方文档

各种文档地址

　　PhantomJS官方地址：http://phantomjs.org/。
　　PhantomJS官方API：http://phantomjs.org/api/。
　　PhantomJS官方示例：http://phantomjs.org/examples/。
　　PhantomJS GitHub：https://github.com/ariya/phantomjs/。
　　python廖雪峰教程
　　图灵机器人

3.话不多说，代码为证

第一步利用PhantomJs将网页生成pdf的功能把需要的文章保存好

// Create the webpage module instance used to load the article and render the PDF.
var page = require("webpage").create();
// Load the system module, which exposes the command-line arguments.
var system = require("system");

// Encoding used for text this script writes to the console.
phantom.outputEncoding = "gb2312";

/**
 * Error callback installed on the page object.
 * Logs the message, then a formatted stack trace, and terminates PhantomJS
 * with exit code 1.
 *
 * @param {string} msg   The error message.
 * @param {Array}  trace Stack frames; each frame is read for file/sourceURL,
 *                       line and (optionally) function.
 */
page.onError = function (msg, trace) {
    console.log(msg);

    var msgStack = ["PHANTOM ERROR: " + msg];
    if (trace && trace.length) {
        msgStack.push("TRACE:");
        trace.forEach(function (t) {
            var location = (t.file || t.sourceURL) + ": " + t.line;
            var inFunction = t.function ? " (in function " + t.function + ")" : "";
            msgStack.push(" -> " + location + inFunction);
        });
    }

    console.error(msgStack.join("\n"));
    phantom.exit(1);
};

// Entry point. Usage: phantomjs <this script> <url> [output name without ".pdf"]
if (system.args.length < 2) {
    console.log("请输入文章地址");
    phantom.exit();
} else {

    var url = system.args[1];
    // Optional output name; when omitted the sanitized page title is used.
    var filename = system.args.length >= 3 ? system.args[2] : "";

    // Prepend a scheme only when the address does not start with one.
    // A plain substring test would skip addresses that merely contain "http".
    if (!/^[a-z][a-z0-9+.\-]*:\/\//i.test(url)) {
        url = "http://" + url;
    }

    // Viewport size; per the original note the height adapts to the content.
    page.viewportSize = { width: 600, height: 20 };

    //=================== PDF page settings =====================
    page.settings.userAgent = "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.117 Safari/537.36";
    page.zoomFactor = 1; // page zoom factor
    page.settings.loadImages = true; // load images in the page

    // Optional paper settings (left disabled, as in the original):
    //   format      - "A4", or e.g. "5in*7.5in", "10cm*20cm", "Letter"
    //   orientation - "portrait" or "landscape"
    //   margin      - one value for all sides, or per side:
    //                 { left: "0.8cm", top: "0.8cm", right: "0.8cm", bottom: "0.8cm" }
    // page.paperSize = { format: "A4", orientation: "portrait", margin: "0.8cm" };
    //=================== PDF page settings =====================

    page.open(url, function (status) {
        console.log("抓取结果:" + status);

        // Do not render when the page failed to load.
        if (status !== "success") {
            phantom.exit(1);
            return;
        }

        // Read the document title from inside the page context.
        var title = page.evaluate(function () {
            return document.title;
        });

        // Strip every character that is not allowed in a Windows file name.
        // The global flag matters: a string pattern only replaces the first hit.
        title = String(title || "").replace(/[|\\\/:*?"<>]/g, "");

        filename = filename.length > 0 ? filename : title;

        // Delay before rendering so scripts in the page can finish.
        var wait = 200;

        // WeChat article pages need their images rewritten before rendering.
        if (url.indexOf("mp.weixin.qq.com") > -1) {

            // Give the rewritten images time to load; adjust as needed.
            wait = 5000;

            // injectJs loads a local script (resolved against libraryPath) and
            // returns a boolean; it takes no callback, unlike includeJs, which
            // is meant for remote scripts.
            if (!page.injectJs("replaceimage.js")) {
                console.log("replaceimage.js 注入失败");
            }
            console.log(title);
            console.log("等待5秒生成pdf");
        }

        // Render only after the wait so late-loading content is included.
        window.setTimeout(function () {
            page.render(filename + ".pdf");
            console.log("pdf生成成功:" + title);
            phantom.exit();
        }, wait);

    });
}

由于微信对图片请求的 Referer 有限制，页面中的图片无法直接展现出来；replaceimage.js 的作用就是把微信页面中的图片加载出来。


/**
 * replaceimage.js - injected into a WeChat article page.
 *
 * Adds a "referrer: never" meta tag and rewrites image URLs on mmbiz.qpic.cn
 * so they are requested through the read.html5.qq.com image proxy.
 * Everything is wrapped in a function scope so no helper or loop variable
 * leaks into the page's globals.
 */
(function () {
    var IMAGE_HOST = "mmbiz.qpic.cn";
    var PROXY_PREFIX = "http://read.html5.qq.com/image?src=forum&q=5&r=0&imgflag=7&imageUrl=";

    // True when the attribute value is a non-empty string pointing at the WeChat image host.
    function isWechatImage(value) {
        return value != null && value.length > 0 && value.indexOf(IMAGE_HOST) > -1;
    }

    // Point every WeChat-hosted <img> inside the container at the proxy.
    function rewriteImages(container) {
        var images = container.getElementsByTagName("IMG");
        for (var i = 0; i < images.length; i++) {
            var img = images[i];

            var src = img.getAttribute("src");
            if (isWechatImage(src)) {
                img.setAttribute("src", PROXY_PREFIX + src);
            }

            // data-src is applied second, so it wins when both attributes are present.
            var lazySrc = img.getAttribute("data-src");
            if (isWechatImage(lazySrc)) {
                img.setAttribute("src", PROXY_PREFIX + lazySrc);
            }

            var sizeHint = img.getAttribute("data-s");
            if (sizeHint != null && sizeHint.length > 0) {
                var parts = sizeHint.split(",");
                // NOTE(review): the original named the parts w,h but assigned them
                // crosswise (width <- second, height <- first). That mapping is
                // preserved here; confirm the real order of the data-s values.
                if (parts.length >= 2) {
                    img.setAttribute("width", parts[1]);
                    img.setAttribute("height", parts[0]);
                }
            }
        }
    }

    // Route <section> background images on the WeChat host through the proxy.
    function rewriteSectionBackgrounds() {
        var sections = document.querySelectorAll("section");
        for (var j = 0; j < sections.length; j++) {
            var background = sections[j].style.backgroundImage;
            if (background && background.indexOf(IMAGE_HOST) > -1) {
                sections[j].style.backgroundImage = background.replace(
                    /https?:\/\/mmbiz\.qpic\.cn/g,
                    function (origin) { return PROXY_PREFIX + origin; }
                );
            }
        }
    }

    function change() {
        var referrerMeta = "<meta name='referrer' content='never'>";
        var firstMeta = document.getElementsByTagName("meta")[0];
        if (firstMeta) {
            firstMeta.insertAdjacentHTML("beforeBegin", referrerMeta);
        } else if (document.head) {
            // No <meta> to anchor on: put the tag at the start of <head> instead.
            document.head.insertAdjacentHTML("afterBegin", referrerMeta);
        }

        var body = document.getElementById("activity-detail");
        if (body) {
            // Hidden image request; per the original note it sets the cookie
            // used by later image requests.
            body.insertAdjacentHTML("beforeBegin", "<image style='display: none' src='http://mmbiz.qpic.cn/mmbiz_png/pmBoItic0ByggW4X5ACKS5rfIfB1VM7RIic0TA9no7a0pRFHLcBibJX8VAyxUw756hHibQccolNUjRbKviaT3QzpwJA/0?wx_fmt=png' alt='bg'/>");
            rewriteImages(body);
        }

        rewriteSectionBackgrounds();
    }

    // If the page has already finished loading, a handler assigned to
    // window.onload would never be called, so run immediately in that case.
    if (document.readyState === "complete") {
        change();
    } else {
        window.onload = change;
    }
})();

执行命令：phantomjs loadpage.js "http://mp.weixin.qq.com/s/zFdPBDJcGsTbQgKGogLzyw"（loadpage.js 即上面第一段脚本，下文 Python 代码中也以这个文件名调用它）

第二步利用Python结合itchat接收消息，调用phantomjs生成pdf

#-*-coding:utf-8-*- 

import itchat,time
from itchat.content import *
import requests,json,sys
import hashlib
import HTMLParser 
import re 
import os
from sys import argv

# Handle text messages: pull the first URL out of the text and turn it into a PDF.
@itchat.msg_register([TEXT])
def text_reply(msg):
    """Find a URL in an incoming text message and hand it to urltopdf.

    Reads msg["Text"] and msg["FromUserName"]. Does nothing when the text
    contains no URL. The PDF is named after the current Unix timestamp.
    """
    # Compile the URL pattern.
    pattern = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*,]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", re.IGNORECASE)

    # search() scans the whole message; match() would only accept a URL that
    # starts at the very first character.
    match = pattern.search(msg["Text"])
    if match is None:
        return

    # Call form of print works on both Python 2 and 3 for a single argument.
    print(match.group())
    filename = str(int(time.time()))
    urltopdf(msg["FromUserName"], match.group(), filename)

# Handle shared-article messages: convert the shared URL to a PDF.
@itchat.msg_register([SHARING])
def sharingtopdf(msg):
    """Pass a shared article to urltopdf.

    The file name is msg["Text"] wrapped in double quotes; the link comes
    from msg["Url"] and the reply goes to msg["FromUserName"].
    """
    sender = msg["FromUserName"]
    quoted_title = u'"' + msg["Text"] + u'"'
    urltopdf(sender, msg["Url"], quoted_title)

# The message types must be passed as ONE list: extra positional arguments
# of msg_register are the isFriendChat/isGroupChat/isMpChat flags, so the
# original call registered this handler for PICTURE only.
@itchat.msg_register([PICTURE, RECORDING, ATTACHMENT, VIDEO])
def download_files(msg):
    """Save an incoming media file and build the reply string.

    Calls msg["Text"] with msg["FileName"] to store the file, then returns
    "@img@<name>" for pictures, "@vid@<name>" for videos and "@fil@<name>"
    for every other type.
    """
    saved_name = msg["FileName"]
    msg["Text"](saved_name)
    tag = {"Picture": "img", "Video": "vid"}.get(msg["Type"], "fil")
    return "@%s@%s" % (tag, saved_name)

@itchat.msg_register(FRIENDS)
def add_friend(msg):
    """Accept a friend request and greet the new contact.

    msg["Text"] is expanded as keyword arguments for itchat.add_friend.
    """
    itchat.add_friend(**msg["Text"])
    # The key is "RecommendInfo"; the previous spelling "RecommentInfo"
    # raised KeyError before the greeting could be sent.
    itchat.send_msg("Nice to meet you!", msg["RecommendInfo"]["UserName"])

# Group-chat text handler. It reuses the name text_reply of the earlier
# handler in this file; both are registered through their decorators.
@itchat.msg_register(TEXT,isGroupChat=True)
def text_reply(msg):
    """Echo a group message back to the group when msg["isAt"] is truthy."""
    if not msg["isAt"]:
        return
    reply = u"@%s\u2005I received: %s" % (msg["ActualNickName"], msg["Content"])
    itchat.send(reply, msg["FromUserName"])

def urltopdf(touser,url,filename):
    """Render `url` to a PDF with PhantomJS and send the file to `touser`.

    touser   -- recipient passed to itchat.send / itchat.send_file
    url      -- article address; HTML entities such as &amp; are unescaped
    filename -- desired PDF name without extension; characters that are not
                allowed in Windows file names are removed

    Changes the process working directory to the PhantomJS folder.
    """
    # Imported here so the function does not rely on a new file-level import.
    import subprocess

    # Folder that holds phantomjs, loadpage.js and the generated PDFs.
    phantom_dir = u"D:\\sourcecode\\htmlsource\\phantomjs"
    os.chdir(phantom_dir)

    # Remove characters that are invalid in a Windows file name.
    filename = re.sub(r'[|\\/:*?"<>]', "", filename)
    if not filename:
        # Nothing left after sanitizing: use a timestamp so the path checked
        # below matches the file PhantomJS writes.
        filename = str(int(time.time()))

    itchat.send(u"正在生成pdf,请稍等", touser)

    # Decode HTML entities in the address and actually use the decoded URL.
    article_url = HTMLParser.HTMLParser().unescape(url)

    # Pass an argument list instead of a shell string: the URL and title come
    # from chat messages, so they must not be interpreted by a shell, and
    # names containing spaces stay a single argument.
    # Arguments are gb2312-encoded, as the original command line was.
    command = [u"phantomjs", u"loadpage.js", article_url, filename]
    subprocess.call([part.encode("gb2312") for part in command])

    all_file_path = u"%s\\%s.pdf" % (phantom_dir, filename)

    # Report a failed conversion instead of trying to send a missing file.
    if not os.path.isfile(all_file_path):
        itchat.send(u"pdf生成失败", touser)
        return

    # Original note: on Python 2.7, download fields.py and overwrite the copy
    # under requests\packages\urllib3 in the install directory.
    itchat.send_file(all_file_path, touser)

# Log in with the enableCmdQR and hotReload options switched on.
# NOTE(review): presumably command-line QR code and cached login session;
# confirm against the installed itchat version.
itchat.auto_login(enableCmdQR=True,hotReload=True)
# Start itchat's run loop.
itchat.run()