这里会显示出您选择的修订版和当前版本之间的差别。
— |
zh:开发:脚本示例:python [2014/09/28 02:26] (当前版本) xiaolan 创建 |
||
---|---|---|---|
行 1: | 行 1: | ||
+ | ====== Python ====== | ||
+ | |||
+ | 一个twister爬虫可通过python轻易实现,如下面的''contrib/usernameCrawler.py''。 | ||
+ | |||
+ | 一旦我们整合了这些数据,它就可以被轻易搜索。事实上,任何网络节点都可以通过技术手段为twister用户托管和提供此类服务(如twisterio.com)。 | ||
+ | ===== usernameCrawler.py ===== | ||
+ | |||
+ | <file python usernameCrawler.py> | ||
+ | |||
+ | #!/usr/bin/python | ||
+ | # | ||
+ | # This sample script is a username crawler: it will obtain all known usernames | ||
+ | # from block chain and then try to download avatar and profiles for all of | ||
+ | # them. The report is shown as an html file. | ||
+ | # | ||
+ | # Downloaded data is cached in a python pickle file, so it may be executed | ||
+ | # again and it won't need to get everything all over again (you may run it | ||
+ | # from cron scripts, for example) | ||
+ | |||
+ | import sys, cPickle, time | ||
+ | |||
+ | dbFileName = "usernameCrawler.pickle" | ||
+ | htmlFileName = "userlist.html" | ||
+ | cacheTimeout = 24*3600 | ||
+ | |||
+ | try: | ||
+ | from bitcoinrpc.authproxy import AuthServiceProxy | ||
+ | except ImportError as exc: | ||
+ | sys.stderr.write("Error: install python-bitcoinrpc (https://github.com/jgarzik/python-bitcoinrpc)\n") | ||
+ | exit(-1) | ||
+ | |||
+ | serverUrl = "http://user:pwd@127.0.0.1:28332" | ||
+ | if len(sys.argv) > 1: | ||
+ | serverUrl = sys.argv[1] | ||
+ | |||
+ | twister = AuthServiceProxy(serverUrl) | ||
+ | |||
+ | class User: | ||
+ | avatar = "" | ||
+ | fullname = "" | ||
+ | location = "" | ||
+ | updateTime = 0 | ||
+ | |||
+ | class MyDb: | ||
+ | lastBlockHash = 0 | ||
+ | |||
+ | try: | ||
+ | db = cPickle.load(open(dbFileName)) | ||
+ | nextHash = db.lastBlockHash | ||
+ | except: | ||
+ | db = MyDb() | ||
+ | db.usernames = {} | ||
+ | nextHash = twister.getblockhash(0) | ||
+ | |||
+ | while True: | ||
+ | block = twister.getblock(nextHash) | ||
+ | db.lastBlockHash = block["hash"] | ||
+ | print str(block["height"]) + "\r", | ||
+ | usernames = block["usernames"] | ||
+ | for u in usernames: | ||
+ | if not db.usernames.has_key(u): | ||
+ | db.usernames[u] = User() | ||
+ | if block.has_key("nextblockhash"): | ||
+ | nextHash = block["nextblockhash"] | ||
+ | else: | ||
+ | break | ||
+ | |||
+ | now = time.time() | ||
+ | for u in db.usernames.keys(): | ||
+ | if db.usernames[u].updateTime + cacheTimeout < now: | ||
+ | |||
+ | print "getting avatar for", u, "..." | ||
+ | d = twister.dhtget(u,"avatar","s") | ||
+ | if len(d) == 1 and d[0].has_key("p") and d[0]["p"].has_key("v"): | ||
+ | db.usernames[u].avatar = d[0]["p"]["v"] | ||
+ | |||
+ | print "getting profile for", u, "..." | ||
+ | d = twister.dhtget(u,"profile","s") | ||
+ | if len(d) == 1 and d[0].has_key("p") and d[0]["p"].has_key("v"): | ||
+ | db.usernames[u].fullname = d[0]["p"]["v"]["fullname"] | ||
+ | db.usernames[u].location = d[0]["p"]["v"]["location"] | ||
+ | |||
+ | db.usernames[u].updateTime = now | ||
+ | |||
+ | cPickle.dump(db,open(dbFileName,"w")) | ||
+ | |||
+ | |||
+ | from HTML import HTML | ||
+ | from cgi import escape | ||
+ | def outputHtmlUserlist(fname, db, keys): | ||
+ | h = HTML() | ||
+ | head = h.head("") | ||
+ | with h.body(""): | ||
+ | with h.table(border='1', newlines=True): | ||
+ | with h.colgroup: | ||
+ | h.col(span="1", style="width: 64px;") | ||
+ | h.col(span="1", style="width: 130px;") | ||
+ | h.col(span="1", style="width: 250px;") | ||
+ | h.col(span="1", style="width: 250px;") | ||
+ | with h.tr: | ||
+ | h.th("avatar") | ||
+ | h.th("username") | ||
+ | h.th("fullname") | ||
+ | h.th("location") | ||
+ | for u in keys: | ||
+ | with h.tr: | ||
+ | with h.td(): | ||
+ | h.img('',src=escape(db.usernames[u].avatar), width="64", height="64") | ||
+ | h.td(u) | ||
+ | h.td(escape(db.usernames[u].fullname)) | ||
+ | h.td(escape(db.usernames[u].location)) | ||
+ | open(fname, "w").write(str(h)) | ||
+ | |||
+ | print "Generating", htmlFileName, "..." | ||
+ | |||
+ | keys = db.usernames.keys() | ||
+ | keys.sort() # sorted by username | ||
+ | outputHtmlUserlist(htmlFileName, db, keys) | ||
+ | |||
+ | </file> | ||
+ | |||
+ | ===== posts_sync.py ===== | ||
+ | |||
+ | 从html页面发帖 | ||
+ | |||
+ | <file python posts_sync.py> | ||
+ | #!/usr/bin/python | ||
+ | # | ||
+ | # posts_sync.py example script to post from html page | ||
+ | |||
+ | import sys, cPickle, time, urllib2 | ||
+ | from pyquery import PyQuery | ||
+ | |||
+ | reload(sys) | ||
+ | sys.setdefaultencoding("utf-8") | ||
+ | |||
+ | try: | ||
+ | from bitcoinrpc.authproxy import AuthServiceProxy | ||
+ | except ImportError as exc: | ||
+ | sys.stderr.write("Error: install python-bitcoinrpc (https://github.com/jgarzik/python-bitcoinrpc)\n") | ||
+ | exit(-1) | ||
+ | |||
+ | ### options parsing | ||
+ | |||
+ | from optparse import OptionParser | ||
+ | parser = OptionParser("usage: %prog [options] <page_url> <username>") | ||
+ | parser.add_option("-s", "--serverUrl", | ||
+ | action="store", dest="serverUrl", default="http://user:pwd@127.0.0.1:28332", | ||
+ | help="connect to specified twisterd server URL") | ||
+ | parser.add_option("-p", "--proxyUrl", | ||
+ | action="store", dest="proxyUrl", default="", | ||
+ | help="proxyUrl to use") | ||
+ | parser.add_option("-d", action="store_true", dest="dryRun", | ||
+ | help="dry-run, just report posts") | ||
+ | |||
+ | (options, args) = parser.parse_args() | ||
+ | if len(args) != 2: | ||
+ | parser.error("incorrect number of arguments") | ||
+ | |||
+ | pageUrl = args[0] | ||
+ | username = args[1] | ||
+ | |||
+ | ### connect to twisterd | ||
+ | |||
+ | twister = AuthServiceProxy(options.serverUrl) | ||
+ | lastK = -1 | ||
+ | lastUserPost = twister.getposts(1, [{"username":username}]) | ||
+ | for i in range(len(lastUserPost)): | ||
+ | if lastUserPost[i]["userpost"]["n"] == username: | ||
+ | lastK = int(lastUserPost[i]["userpost"]["k"]) | ||
+ | break | ||
+ | print username, "lastK:", lastK | ||
+ | |||
+ | ### load db from previous run | ||
+ | |||
+ | dbFileName = username + ".pickle" | ||
+ | class MyDb: | ||
+ | lastDatatime = 0 | ||
+ | try: | ||
+ | db = cPickle.load(open(dbFileName)) | ||
+ | except: | ||
+ | db = MyDb() | ||
+ | |||
+ | ### setup proxy | ||
+ | |||
+ | if len(options.proxyUrl): | ||
+ | proxy = urllib2.ProxyHandler({'http': options.proxyUrl,'https': options.proxyUrl}) | ||
+ | opener = urllib2.build_opener(proxy) | ||
+ | urllib2.install_opener(opener) | ||
+ | |||
+ | ### download html content | ||
+ | |||
+ | user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.73.11 (KHTML, like Gecko) Version/7.0.1 Safari/537.73.11' | ||
+ | headers = { 'User-Agent' : user_agent } | ||
+ | req = urllib2.Request(pageUrl, headers = headers) | ||
+ | response = urllib2.urlopen(req) | ||
+ | html = response.read() | ||
+ | pq = PyQuery(html.decode('utf8')) | ||
+ | |||
+ | ### parse html | ||
+ | |||
+ | items = pq(".content") | ||
+ | for i in xrange(len(items)-1,0,-1): | ||
+ | item = items.eq(i) | ||
+ | datatime = int(item.find("[data-time]").attr("data-time")) | ||
+ | if datatime > db.lastDatatime : | ||
+ | db.lastDatatime = datatime | ||
+ | p = item.find("p") | ||
+ | ptext = p.text() | ||
+ | ptext = ptext.replace(":// ","://").replace("# ","#").replace("@ ","@") | ||
+ | print "newpostmsg", username, lastK+1, ptext | ||
+ | if not options.dryRun: | ||
+ | try: | ||
+ | twister.newpostmsg(username, lastK+1, ptext) | ||
+ | except: | ||
+ | pass | ||
+ | lastK = lastK+1 | ||
+ | |||
+ | if not options.dryRun: | ||
+ | cPickle.dump(db,open(dbFileName,"w")) | ||
+ | </file> |