====== Python ====== 一个twister爬虫可通过python轻易实现,如下面的''contrib/usernameCrawler.py''。 一旦我们有了一个整合的数据,它将变得可轻易搜索。事实上,任何的网络节点都可以通过技术手段为twister用户托管和提供此类服务。(如twisterio.com) ===== usernameCrawler.py ===== #!/usr/bin/python # # This sample script is a username crawler: it will obtain all known usernames # from block chain and then try to download avatar and profiles for all of # them. The report is shown as an html file. # # Downloaded data is cached in a python pickle file, so it may be executed # again and it won't need to get everything all over again (you may run it # from cron scripts, for example) import sys, cPickle, time dbFileName = "usernameCrawler.pickle" htmlFileName = "userlist.html" cacheTimeout = 24*3600 try: from bitcoinrpc.authproxy import AuthServiceProxy except ImportError as exc: sys.stderr.write("Error: install python-bitcoinrpc (https://github.com/jgarzik/python-bitcoinrpc)\n") exit(-1) serverUrl = "http://user:pwd@127.0.0.1:28332" if len(sys.argv) > 1: serverUrl = sys.argv[1] twister = AuthServiceProxy(serverUrl) class User: avatar = "" fullname = "" location = "" updateTime = 0 class MyDb: lastBlockHash = 0 try: db = cPickle.load(open(dbFileName)) nextHash = db.lastBlockHash except: db = MyDb() db.usernames = {} nextHash = twister.getblockhash(0) while True: block = twister.getblock(nextHash) db.lastBlockHash = block["hash"] print str(block["height"]) + "\r", usernames = block["usernames"] for u in usernames: if not db.usernames.has_key(u): db.usernames[u] = User() if block.has_key("nextblockhash"): nextHash = block["nextblockhash"] else: break now = time.time() for u in db.usernames.keys(): if db.usernames[u].updateTime + cacheTimeout < now: print "getting avatar for", u, "..." d = twister.dhtget(u,"avatar","s") if len(d) == 1 and d[0].has_key("p") and d[0]["p"].has_key("v"): db.usernames[u].avatar = d[0]["p"]["v"] print "getting profile for", u, "..." d = twister.dhtget(u,"profile","s") if len(d) == 1 and d[0].has_key("p") and d[0]["p"].has_key("v"): db.usernames[u].fullname = d[0]["p"]["v"]["fullname"] db.usernames[u].location = d[0]["p"]["v"]["location"] db.usernames[u].updateTime = now cPickle.dump(db,open(dbFileName,"w")) from HTML import HTML from cgi import escape def outputHtmlUserlist(fname, db, keys): h = HTML() head = h.head("") with h.body(""): with h.table(border='1', newlines=True): with h.colgroup: h.col(span="1", style="width: 64px;") h.col(span="1", style="width: 130px;") h.col(span="1", style="width: 250px;") h.col(span="1", style="width: 250px;") with h.tr: h.th("avatar") h.th("username") h.th("fullname") h.th("location") for u in keys: with h.tr: with h.td(): h.img('',src=escape(db.usernames[u].avatar), width="64", height="64") h.td(u) h.td(escape(db.usernames[u].fullname)) h.td(escape(db.usernames[u].location)) open(fname, "w").write(str(h)) print "Generating", htmlFileName, "..." keys = db.usernames.keys() keys.sort() # sorted by username outputHtmlUserlist(htmlFileName, db, keys) ===== posts_sync.py ===== 从html页面发贴 #!/usr/bin/python # # posts_sync.py example script to post from html page import sys, cPickle, time, urllib2 from pyquery import PyQuery reload(sys) sys.setdefaultencoding("utf-8") try: from bitcoinrpc.authproxy import AuthServiceProxy except ImportError as exc: sys.stderr.write("Error: install python-bitcoinrpc (https://github.com/jgarzik/python-bitcoinrpc)\n") exit(-1) ### options parsing from optparse import OptionParser parser = OptionParser("usage: %prog [options] ") parser.add_option("-s", "--serverUrl", action="store", dest="serverUrl", default="http://user:pwd@127.0.0.1:28332", help="connect to specified twisterd server URL") parser.add_option("-p", "--proxyUrl", action="store", dest="proxyUrl", default="", help="proxyUrl to use") parser.add_option("-d", action="store_true", dest="dryRun", help="dry-run, just report posts") (options, args) = parser.parse_args() if len(args) != 2: parser.error("incorrect number of arguments") pageUrl = args[0] username = args[1] ### connect to twisterd twister = AuthServiceProxy(options.serverUrl) lastK = -1 lastUserPost = twister.getposts(1, [{"username":username}]) for i in range(len(lastUserPost)): if lastUserPost[i]["userpost"]["n"] == username: lastK = int(lastUserPost[i]["userpost"]["k"]) break print username, "lastK:", lastK ### load db from previous run dbFileName = username + ".pickle" class MyDb: lastDatatime = 0 try: db = cPickle.load(open(dbFileName)) except: db = MyDb() ### setup proxy if len(options.proxyUrl): proxy = urllib2.ProxyHandler({'http': options.proxyUrl,'https': options.proxyUrl}) opener = urllib2.build_opener(proxy) urllib2.install_opener(opener) ### download html content user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.73.11 (KHTML, like Gecko) Version/7.0.1 Safari/537.73.11' headers = { 'User-Agent' : user_agent } req = urllib2.Request(pageUrl, headers = headers) response = urllib2.urlopen(req) html = response.read() pq = PyQuery(html.decode('utf8')) ### parse html items = pq(".content") for i in xrange(len(items)-1,0,-1): item = items.eq(i) datatime = int(item.find("[data-time]").attr("data-time")) if datatime > db.lastDatatime : db.lastDatatime = datatime p = item.find("p") ptext = p.text() ptext = ptext.replace(":// ","://").replace("# ","#").replace("@ ","@") print "newpostmsg", username, lastK+1, ptext if not options.dryRun: try: twister.newpostmsg(username, lastK+1, ptext) except: pass lastK = lastK+1 if not options.dryRun: cPickle.dump(db,open(dbFileName,"w"))