目录

Python

使用 Python 可以轻松实现一个 twister 爬虫,例如下面的 contrib/usernameCrawler.py。

一旦我们有了整合后的数据,搜索就会变得非常容易。事实上,任何网络节点都可以通过技术手段为 twister 用户托管并提供此类服务(如 twisterio.com)。

usernameCrawler.py

usernameCrawler.py
#!/usr/bin/python
#
# This sample script is a username crawler: it obtains all known usernames
# from the block chain and then tries to download the avatar and profile of
# each of them. The report is written as an HTML file.
#
# Downloaded data is cached in a python pickle file, so it may be executed
# again and it won't need to get everything all over again (you may run it
# from cron scripts, for example)
 
import sys, cPickle, time
 
# File names of the pickle cache and of the generated HTML report.
dbFileName = "usernameCrawler.pickle"
htmlFileName = "userlist.html"
# Seconds a cached avatar/profile stays fresh before it is fetched again.
cacheTimeout = 24*3600

try:
    from bitcoinrpc.authproxy import AuthServiceProxy
except ImportError:
    sys.stderr.write("Error: install python-bitcoinrpc (https://github.com/jgarzik/python-bitcoinrpc)\n")
    # sys.exit() rather than the site-provided exit(), which is intended for
    # the interactive shell and is missing when the site module is not loaded.
    sys.exit(-1)

# Default RPC endpoint; the first command-line argument overrides it.
serverUrl = "http://user:pwd@127.0.0.1:28332"
if len(sys.argv) > 1:
    serverUrl = sys.argv[1]

# RPC proxy used for every twister call in this script.
twister = AuthServiceProxy(serverUrl)
 
class User:
    """Cached record for one username; every field starts out empty."""
    # Text fields, overwritten once the matching DHT lookup succeeds.
    avatar = fullname = location = ""
    # Time of the last refresh as returned by time.time(); the initial 0
    # makes a new record count as stale.
    updateTime = 0
 
class MyDb:
    """Picklable container for the crawler state.

    A freshly created instance is also given a ``usernames`` dict by the
    loading code that follows this class.
    """
    # Hash of the last block already scanned; 0 until a block has been read.
    lastBlockHash = 0
 
# Resume from the cached database when it can be read; otherwise start a fresh
# one at block 0.
# NOTE(review): unpickling can execute arbitrary code, so the cache file must
# only ever come from a trusted location.
try:
    # "with" closes the file even when unpickling fails.
    with open(dbFileName) as dbFile:
        db = cPickle.load(dbFile)
    nextHash = db.lastBlockHash
except Exception:
    # Missing, unreadable or corrupt cache: rebuild from scratch. Catching
    # Exception (not a bare except) lets Ctrl-C and sys.exit() propagate.
    db = MyDb()
    db.usernames = {}
    nextHash = twister.getblockhash(0)
 
while True:
    block = twister.getblock(nextHash)
    db.lastBlockHash = block["hash"]
    print str(block["height"]) + "\r",
    usernames = block["usernames"]
    for u in usernames:
        if not db.usernames.has_key(u):
            db.usernames[u] = User()
    if block.has_key("nextblockhash"):
        nextHash = block["nextblockhash"]
    else:
        break
 
now = time.time()
for u in db.usernames.keys():
    if db.usernames[u].updateTime + cacheTimeout < now:
 
        print "getting avatar for", u, "..."
        d = twister.dhtget(u,"avatar","s")
        if len(d) == 1 and d[0].has_key("p") and d[0]["p"].has_key("v"):
            db.usernames[u].avatar = d[0]["p"]["v"]
 
        print "getting profile for", u, "..."
        d = twister.dhtget(u,"profile","s")
        if len(d) == 1 and d[0].has_key("p") and d[0]["p"].has_key("v"):
            db.usernames[u].fullname = d[0]["p"]["v"]["fullname"]
            db.usernames[u].location = d[0]["p"]["v"]["location"]
 
        db.usernames[u].updateTime = now
 
cPickle.dump(db,open(dbFileName,"w"))
 
 
from HTML import HTML
from cgi import escape
def outputHtmlUserlist(fname, db, keys):
    """Write an HTML table listing the given users to *fname*.

    fname -- path of the HTML file to create (opened with mode "w").
    db    -- object whose ``usernames`` mapping yields records with
             ``avatar``, ``fullname`` and ``location`` attributes.
    keys  -- usernames to list, in the order the rows should appear.

    NOTE(review): the record fields are passed to escape(), so they are
    assumed to be strings -- confirm against what the DHT can return.
    """
    h = HTML()
    h.head("")
    with h.body(""):
        with h.table(border='1', newlines=True):
            # Fixed column widths: avatar, username, fullname, location.
            with h.colgroup:
                h.col(span="1", style="width: 64px;")
                h.col(span="1", style="width: 130px;")
                h.col(span="1", style="width: 250px;")
                h.col(span="1", style="width: 250px;")
            with h.tr:
                h.th("avatar")
                h.th("username")
                h.th("fullname")
                h.th("location")
            for u in keys:
                user = db.usernames[u]
                with h.tr:
                    with h.td():
                        h.img('',src=escape(user.avatar), width="64", height="64")
                    h.td(u)
                    h.td(escape(user.fullname))
                    h.td(escape(user.location))
    # "with" closes the report file instead of leaving it to the garbage
    # collector.
    with open(fname, "w") as out:
        out.write(str(h))
 
print "Generating", htmlFileName, "..."
 
keys = db.usernames.keys()
keys.sort() # sorted by username
outputHtmlUserlist(htmlFileName, db, keys)

posts_sync.py

从 HTML 页面发帖

posts_sync.py
#!/usr/bin/python
#
# posts_sync.py example script to post from html page
 
import sys, cPickle, time, urllib2
from pyquery import PyQuery
 
# Python 2 idiom: sys.setdefaultencoding is removed from the sys module at
# interpreter start-up, so sys is reloaded to get it back before switching
# the default string encoding to UTF-8.
reload(sys)
sys.setdefaultencoding("utf-8")

try:
    from bitcoinrpc.authproxy import AuthServiceProxy
except ImportError:
    sys.stderr.write("Error: install python-bitcoinrpc (https://github.com/jgarzik/python-bitcoinrpc)\n")
    # sys.exit() rather than the site-provided exit(), which is intended for
    # the interactive shell and is missing when the site module is not loaded.
    sys.exit(-1)
 
### options parsing

from optparse import OptionParser

# Exactly two positional arguments are required: the page URL and the
# username; anything else ends the script through parser.error().
parser = OptionParser("usage: %prog [options] <page_url> <username>")
parser.add_option("-s", "--serverUrl", dest="serverUrl", action="store",
                  default="http://user:pwd@127.0.0.1:28332",
                  help="connect to specified twisterd server URL")
parser.add_option("-p", "--proxyUrl", dest="proxyUrl", action="store",
                  default="",
                  help="proxyUrl to use")
parser.add_option("-d", dest="dryRun", action="store_true",
                  help="dry-run, just report posts")

options, args = parser.parse_args()
if len(args) != 2:
    parser.error("incorrect number of arguments")

pageUrl, username = args
 
### connect to twisterd
 
twister = AuthServiceProxy(options.serverUrl)
lastK = -1
lastUserPost = twister.getposts(1, [{"username":username}])
for i in range(len(lastUserPost)):
    if lastUserPost[i]["userpost"]["n"] == username:
        lastK = int(lastUserPost[i]["userpost"]["k"])
        break
print username, "lastK:", lastK
 
### load db from previous run

# One state file per username.
dbFileName = username + ".pickle"
class MyDb:
    """State carried over between runs."""
    # Largest data-time value already handled; 0 when there is no saved state.
    lastDatatime = 0
# NOTE(review): unpickling can execute arbitrary code, so the state file must
# only ever come from a trusted location.
try:
    # "with" closes the file even when unpickling fails.
    with open(dbFileName) as dbFile:
        db = cPickle.load(dbFile)
except Exception:
    # Missing, unreadable or corrupt state: start over. Catching Exception
    # (not a bare except) lets Ctrl-C and sys.exit() propagate.
    db = MyDb()
 
### setup proxy

# With an empty proxyUrl (the default) no opener is installed.
if options.proxyUrl:
    proxies = {'http': options.proxyUrl, 'https': options.proxyUrl}
    proxy = urllib2.ProxyHandler(proxies)
    urllib2.install_opener(urllib2.build_opener(proxy))
 
### download html content

# The request carries a desktop Safari User-Agent string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.73.11 (KHTML, like Gecko) Version/7.0.1 Safari/537.73.11',
}
response = urllib2.urlopen(urllib2.Request(pageUrl, headers=headers))
html = response.read()
# The body is decoded as UTF-8 before being handed to PyQuery.
pq = PyQuery(html.decode('utf8'))
 
### parse html
 
items = pq(".content")
for i in xrange(len(items)-1,0,-1):
    item = items.eq(i)
    datatime = int(item.find("[data-time]").attr("data-time"))
    if datatime > db.lastDatatime :
        db.lastDatatime = datatime
        p = item.find("p")
        ptext = p.text()
        ptext = ptext.replace(":// ","://").replace("# ","#").replace("@ ","@")
        print "newpostmsg", username, lastK+1, ptext
        if not options.dryRun:
        try:
            twister.newpostmsg(username, lastK+1, ptext)
        except:
            pass
        lastK = lastK+1
 
# State is persisted only outside a dry run.
if not options.dryRun:
    # "with" guarantees the state file is flushed and closed.
    with open(dbFileName,"w") as dbFile:
        cPickle.dump(db,dbFile)