什么是PyV8?
PyV8是一个用Python封装V8引擎的壳。它提供了简单易用的API,能够利用Python来构建JavaScript的运行时环境。
PyV8能用来干什么?
在nodejs火热流行的时代,或许很少人关注这个基于python简单封装的v8引擎。在某些方面,它比nodejs简洁,而它们拥有同样的本质基础,使得它具有和nodejs相似的潜力。
既然是基于v8的,那么利用它来解析dom和执行javascript是理所当然的。试想一下,如果我们能够建立一个系统把html文档结构解析成w3c dom树,那么我们就可以在上面运行任何javascript,使得这个系统事实上成为一个浏览器(尽管缺少渲染和显示的部分)。
不过,PyV8提供的是纯JS运行环境,因此,要解析HTML文档,还需要为它构建出W3C DOM以及浏览器环境。实现DOM和浏览器接口毫无疑问是件力气活,幸运的是,PyV8的官方demo已经为我们做了80%以上的工作,提供了w3c.py和browser.py!我们所要做的只是在此基础上做一些基本的修改。
好吧,让我们来实践一下:
首先我们构建一个基本的js运行环境,参考commonjs的api实现js基础的模块加载和执行。这一部分还不涉及dom和浏览器,基本上是js原生环境。
import os
import platform
import re
import threading

from PyV8 import JSContext, JSError

from logger import logger
class CommonJS():
    """Minimal CommonJS-style module loader built on a PyV8 ``JSContext``.

    On construction every ``*.js`` file found under the registered search
    paths is indexed by a slash-separated module id (``sub/dir/name``).
    ``require`` evaluates a module at most once and caches whatever the
    JavaScript global ``exports`` holds afterwards.

    The instance itself is passed to ``JSContext``; presumably this exposes
    its methods (such as ``require``) to JavaScript -- confirm against the
    PyV8 documentation.
    """

    # Directories scanned for modules.  Shared by all instances; extend it
    # with ``CommonJS.append`` *before* creating an instance.
    _js_path = [os.path.dirname(__file__),
                os.path.join(os.path.dirname(__file__), 'core')]
    # Shared logger used for load and error reporting.
    _js_logger = logger().instance()

    def __init__(self):
        """Create the JS context and index every module on the search path."""
        # Re-entrant lock serialising all access to the JS context.  The
        # previous boolean flag was checked and set in two separate steps,
        # so two threads could both pass the check, and waiters spun at
        # 100% CPU.
        self._js_threadlock = threading.RLock()
        self._js_ctx = JSContext(self)
        self._js_modules = {}      # module id -> absolute file path
        self._loaded_modules = {}  # file path -> cached ``exports`` value

        for jsroot in CommonJS._js_path:
            for root, _dirs, files in os.walk(jsroot):
                relpath = os.path.relpath(root, jsroot)
                # The search root itself is reported as '.', which must map
                # to an empty namespace.
                namespace = re.sub(r'^\.', '', relpath)
                # Module ids always use '/', whatever the OS separator is.
                namespace = namespace.replace(os.sep, '/')
                if namespace:
                    namespace = namespace + '/'
                for _file in files:
                    if _file.endswith('.js'):
                        name = _file[:-len('.js')]
                        self._js_modules[namespace + name] = \
                            os.path.join(root, _file)

        self.execute("var exports;")

    @classmethod
    def append(cls, path):
        """Add *path* to the shared module search path if not yet present.

        Only instances created afterwards will index the new directory,
        because indexing happens in ``__init__``.
        """
        if path not in CommonJS._js_path:
            CommonJS._js_path.append(path)

    def require(self, module):
        """Load *module* (once) and return its cached ``exports`` value.

        Raises:
            Exception: if *module* is not indexed, or if evaluating the
                module source raises ``JSError`` (re-raised wrapped).
        """
        if module not in self._js_modules:
            raise Exception("unknown module `" + module + "`")

        path = self._js_modules[module]
        if path not in self._loaded_modules:
            self._js_logger.info("loading module <%s>...", module)
            # Close the file deterministically instead of leaking the handle.
            with open(path) as source:
                code = source.read()
            try:
                # Decoding validates the source as UTF-8.  On Windows the
                # text is handed to V8 as UTF-8 bytes again -- presumably a
                # workaround for that platform's PyV8 build; TODO confirm.
                code = code.decode('utf-8')
                if platform.system() == 'Windows':
                    code = code.encode('utf-8')
                self._js_ctx.eval(code)
            except JSError as ex:
                self._js_logger.error(ex)
                self._js_logger.debug(ex.stackTrace)
                raise Exception(ex)
            self._loaded_modules[path] = self._js_ctx.locals.exports
        return self._loaded_modules[path]

    def execute(self, code, args=()):
        """Run *code* inside the JS context and return its result.

        Args:
            code: JavaScript source text, or a Python callable.
            args: positional arguments passed to *code* when it is callable.

        Raises:
            Exception: wrapping any ``JSError`` raised during execution.
        """
        # Take the lock before entering the context so that only one thread
        # at a time touches it; the lock is released after ``leave()``.
        with self._js_threadlock:
            self._js_ctx.enter()
            try:
                if isinstance(code, basestring):
                    # Only byte strings need decoding; calling ``decode`` on
                    # a unicode object would first encode it as ASCII and
                    # fail on non-ASCII source.
                    if isinstance(code, str):
                        code = code.decode('utf-8')
                    if platform.system() == 'Windows':
                        code = code.encode('utf-8')
                    return self._js_ctx.eval(code)
                return code(*args)
            except JSError as ex:
                self._js_logger.error(ex)
                self._js_logger.debug(ex.stackTrace)
                raise Exception(ex)
            finally:
                self._js_ctx.leave()
好了,我们可以测试一下:
from commonjs import CommonJS

# Register extra module directories before building the runtime: the search
# path is scanned only once, inside CommonJS.__init__, so a path appended
# after construction would never be indexed for this instance.
CommonJS.append('/my/js/path')
ctx = CommonJS()
print(ctx.execute('require("base"); QW.provide("Test",{}); exports = QW.Test'))
结果是
[JSObject]
在这基础上,我们进一步实现整个运行时环境和一些内置方法:
- #JavaScript HTML Context for PyV8
- from PyV8 import JSClass
- from logger import logger
- import re, threading, hashlib
- import urllib,urllib2
- from w3c import parseString, Document, HTMLElement
- from commonjs import CommonJS
- import browser
- from StringIO import StringIO
- import gzip
class JSR(CommonJS, browser.HtmlWindow):
    """JavaScript runtime with a DOM, combining ``CommonJS`` and ``HtmlWindow``.

    The runtime can be built from an existing ``Document``, from an HTML
    string (anything starting with ``<``), or from a URL that is fetched
    over the network.
    """

    def __init__(self, url_or_dom, charset=None, headers=None, body=None,
                 timeout=2):
        """Build the window/DOM and load the bundled JavaScript modules.

        Args:
            url_or_dom: a ``Document``, an HTML string starting with ``<``,
                or a URL (``http://`` is assumed when no scheme is given).
            charset: fallback charset for a fetched page; a charset in the
                response's Content-Type header takes precedence, and
                ``utf-8`` is used when neither is available.
            headers: optional dict of HTTP request headers.
            body: optional dict of form fields; when non-empty it is
                url-encoded and sent as the request data.
            timeout: socket timeout in seconds.  Note that this sets the
                process-wide default socket timeout.
        """
        headers = {} if headers is None else headers
        body = {} if body is None else body
        # Timers belong to one runtime; a class-level dict would be shared
        # by (and collide between) every JSR instance.
        self._js_timer_map = {}

        urllib2.socket.setdefaulttimeout(timeout)
        jsonp = False
        if isinstance(url_or_dom, Document):
            url = "localhost:document"
            dom = url_or_dom
        elif url_or_dom.startswith('<'):
            url = "localhost:string"
            dom = parseString(url_or_dom)
        else:
            url = url_or_dom
            if not re.match(r'\w+://', url):
                url = "http://" + url
            # Passing data=None keeps the request a GET when there is no
            # body; any non-None data (even '') would turn it into a POST.
            data = urllib.urlencode(body) if body else None
            request = urllib2.Request(url, data, headers=headers)
            # Open the prepared request (not the bare URL) so that the
            # caller's headers and body are actually sent.
            response = urllib2.urlopen(request)

            contentType = response.headers.get('Content-Type')
            if contentType:
                # A JavaScript/JSON payload is exposed as ``window.data``
                # further down.
                if re.search(r'x-javascript|json', contentType):
                    jsonp = True
                m = re.match(r'^.*;\s*charset=(.*)$', contentType)
                if m:
                    # The header value may be quoted, e.g. charset="utf-8".
                    charset = m.group(1).strip().strip('"\'')
            if not charset:
                charset = 'utf-8'  # default charset

            html = response.read()
            encoding = response.headers.get('Content-Encoding')
            if encoding == 'gzip':
                buf = StringIO(html)
                f = gzip.GzipFile(fileobj=buf)
                html = f.read()
            # Keep the (decompressed) undecoded payload available.
            self.__html__ = html
            html = unicode(html, encoding=charset, errors='ignore')
            dom = parseString(html)

        navigator = browser.matchNavigator(headers.get('User-Agent') or '')
        browser.HtmlWindow.__init__(self, url, dom, navigator)
        CommonJS.__init__(self)
        self.console = JSConsole(self._js_logger)

        for module in "base, array.h, function.h, helper.h, object.h, string.h, date.h, custevent, selector, dom_retouch".split(","):
            self.execute(self.require, [module.strip()])

        if jsonp:
            code = "window.data=" + html.encode('utf-8')
            self.execute(code)

        self._js_logger.info('JavaScript runtime ready.')

    def _js_execTimer(self, id, callback, delay, repeat=False):
        """Schedule timer *id* to run after *delay* milliseconds.

        The generated JavaScript runs the code stored under
        ``_js_timers[id]`` and, when *repeat* is true, calls back into this
        method to re-arm the timer.  *callback* is not used here; it is
        accepted because the generated JavaScript passes it on re-arm.
        """
        code = '(function f(){ _js_timers[' + str(id) + '][1].code();'
        if repeat:
            code = code + '_js_execTimer(' + str(id) + ', f, ' + str(delay) + ', true);'
        code = code + '})();'
        # ``execute`` serialises access to the JS context, so the timer
        # thread cannot run concurrently with other JavaScript.
        timer = threading.Timer(delay / 1000.0, lambda: self.execute(code))
        self._js_timer_map[id] = timer
        timer.start()

    def _js_cancelTimer(self, timerId):
        """Cancel and forget the pending timer; return True if one was known."""
        if timerId not in self._js_timer_map:
            return False
        # Remove the entry instead of storing None, so that clearing the
        # same id twice is a harmless no-op rather than an AttributeError.
        timer = self._js_timer_map.pop(timerId)
        if timer is not None:
            timer.cancel()
        return True

    def setTimeout(self, callback, delay):
        """Register a one-shot timer and return its id."""
        timerId = super(JSR, self).setTimeout(callback, delay)
        self._js_execTimer(timerId, callback, delay, False)
        return timerId

    def clearTimeout(self, timerId):
        """Cancel a timer created by ``setTimeout``; unknown ids are ignored."""
        if self._js_cancelTimer(timerId):
            super(JSR, self).clearTimeout(timerId)

    def setInterval(self, callback, delay):
        """Register a repeating timer and return its id."""
        timerId = super(JSR, self).setInterval(callback, delay)
        self._js_execTimer(timerId, callback, delay, True)
        return timerId

    def clearInterval(self, timerId):
        """Cancel a timer created by ``setInterval``; unknown ids are ignored."""
        if self._js_cancelTimer(timerId):
            # NOTE(review): this delegates to the parent's clearTimeout, not
            # clearInterval.  Kept as written because browser.py is not
            # visible here -- confirm which one is intended.
            super(JSR, self).clearTimeout(timerId)

    def md5(self, str):
        """Return the hexadecimal MD5 digest of *str*."""
        return hashlib.md5(str).hexdigest()
class JSConsole(JSClass):
    """Console object that forwards ``log`` calls to a Python logger."""

    def __init__(self, logger):
        """Remember the logger that receives console output."""
        self._js_logger = logger

    def log(self, msg):
        """Write *msg* to the logger at INFO level, decoded from UTF-8."""
        text = str(msg)
        decoded = text.decode('utf-8')
        self._js_logger.info(decoded)
到此为止,一个简单的运行时环境就实现了。
使用方法:
from jsruntime import JSR

# Fetch the page and build a DOM-backed JavaScript runtime around it.
rt = JSR('www.baidu.com')

# Alerts the page title; the result is "百度一下,你就知道".
rt.execute('alert(document.title)')

# Returns the first <div> matched under <body>.
rt.execute('document.querySelector("body div")')
- ...
利用它,你就可以用js来处理抓取的页面元素,甚至通过运行页面上的js来获取动态加载的内容,只要继续扩展,可以用它来实现一个相当不错的页面内容采集服务,而采集规则,不再只能是简单的正则,而是可以用js像处理任何动态网页那样来处理抓取的页面。
有兴趣的同学可以研究一下,PyV8的潜力还是很大的,大家一起期待一下~
网友评论已有0条评论, 我也要评论