twister

Peer-to-peer microblogging

用户工具

站点工具


zh:开发:脚本示例:python

差别

这里会显示出您选择的修订版和当前版本之间的差别。

到此差别页面的链接

zh:开发:脚本示例:python [2014/09/28 02:26] (当前版本)
xiaolan 创建
行 1: 行 1:
  
 +====== Python ======
 +
一个twister爬虫可通过python轻易实现,如下面的''contrib/usernameCrawler.py''。
 +
 +一旦我们有了一个整合的数据,它将变得可轻易搜索。事实上,任何的网络节点都可以通过技术手段为twister用户托管和提供此类服务。(如twisterio.com)
 +===== usernameCrawler.py =====
 +
<file python usernameCrawler.py>
 +
#!/usr/bin/python
#
# This sample script is a username crawler: it will obtain all known usernames
# from block chain and then try to download avatar and profiles for all of
# them. The report is shown as an html file.
#
# Downloaded data is cached in a python pickle file, so it may be executed
# again and it won't need to get everything all over again (you may run it
# from cron scripts, for example)

import sys, cPickle, time

# Cache/report locations and how long (seconds) a cached profile stays fresh.
dbFileName = "usernameCrawler.pickle"
htmlFileName = "userlist.html"
cacheTimeout = 24*3600

try:
    from bitcoinrpc.authproxy import AuthServiceProxy
except ImportError as exc:
    sys.stderr.write("Error: install python-bitcoinrpc (https://github.com/jgarzik/python-bitcoinrpc)\n")
    exit(-1)

# Default twisterd RPC endpoint; the first CLI argument overrides it.
serverUrl = "http://user:pwd@127.0.0.1:28332"
if len(sys.argv) > 1:
    serverUrl = sys.argv[1]

twister = AuthServiceProxy(serverUrl)
 +
class User:
    """Cached DHT data for one username.

    Class attributes act as defaults; real values are set as instance
    attributes by the crawler loop below.
    """
    avatar = ""
    fullname = ""
    location = ""
    updateTime = 0  # epoch seconds of last successful refresh; 0 = never fetched
 +
class MyDb:
    """Pickled crawler state: last visited block hash plus username -> User map."""
    lastBlockHash = 0

# Resume from the previous run's pickle if possible; otherwise start a fresh
# crawl from the genesis block.  The broad Exception is deliberate: a missing
# or corrupt cache simply means "crawl from scratch" (but we no longer swallow
# KeyboardInterrupt/SystemExit as the original bare except did).
try:
    db = cPickle.load(open(dbFileName))
    nextHash = db.lastBlockHash
except Exception:
    db = MyDb()
    db.usernames = {}
    nextHash = twister.getblockhash(0)
 +
 +while True:
 +    block = twister.getblock(nextHash)
 +    db.lastBlockHash = block["​hash"​]
 +    print str(block["​height"​]) + "​\r",​
 +    usernames = block["​usernames"​]
 +    for u in usernames:
 +        if not db.usernames.has_key(u):​
 +            db.usernames[u] = User()
 +    if block.has_key("​nextblockhash"​):​
 +        nextHash = block["​nextblockhash"​]
 +    else:
 +        break
 +
 +now = time.time()
 +for u in db.usernames.keys():​
 +    if db.usernames[u].updateTime + cacheTimeout < now:
 +
 +        print "​getting avatar for", u, "​..."​
 +        d = twister.dhtget(u,"​avatar","​s"​)
 +        if len(d) == 1 and d[0].has_key("​p"​) and d[0]["​p"​].has_key("​v"​):​
 +            db.usernames[u].avatar = d[0]["​p"​]["​v"​]
 +
 +        print "​getting profile for", u, "​..."​
 +        d = twister.dhtget(u,"​profile","​s"​)
 +        if len(d) == 1 and d[0].has_key("​p"​) and d[0]["​p"​].has_key("​v"​):​
 +            db.usernames[u].fullname = d[0]["​p"​]["​v"​]["​fullname"​]
 +            db.usernames[u].location = d[0]["​p"​]["​v"​]["​location"​]
 +
 +        db.usernames[u].updateTime = now
 +
 +cPickle.dump(db,​open(dbFileName,"​w"​))
 +
 +
 +from HTML import HTML
 +from cgi import escape
def outputHtmlUserlist(fname, db, keys):
    """Write an html table (avatar, username, fullname, location) to fname.

    keys is the ordered list of usernames to include; all user-supplied
    strings are escaped to prevent the report page from being injectable.
    """
    h = HTML()
    head = h.head("")
    with h.body(""):
        with h.table(border='1', newlines=True):
            with h.colgroup:
                h.col(span="1", style="width: 64px;")
                h.col(span="1", style="width: 130px;")
                h.col(span="1", style="width: 250px;")
                h.col(span="1", style="width: 250px;")
            with h.tr:
                h.th("avatar")
                h.th("username")
                h.th("fullname")
                h.th("location")
            for u in keys:
                with h.tr:
                    with h.td():
                        h.img('', src=escape(db.usernames[u].avatar),
                              width="64", height="64")
                    h.td(u)
                    h.td(escape(db.usernames[u].fullname))
                    h.td(escape(db.usernames[u].location))
    # Close the report file deterministically instead of leaking the handle.
    with open(fname, "w") as f:
        f.write(str(h))
 +
 +print "​Generating",​ htmlFileName,​ "​..."​
 +
 +keys = db.usernames.keys()
 +keys.sort() # sorted by username
 +outputHtmlUserlist(htmlFileName,​ db, keys)
 +
</file>
 +
 +===== posts_sync.py =====
 +
 +从html页面发贴
 +
<file python posts_sync.py>
#!/usr/bin/python
#
# posts_sync.py example script to post from html page

import sys, cPickle, time, urllib2
from pyquery import PyQuery

# HACK: force utf-8 as the process-wide default codec so implicit str/unicode
# mixing below doesn't raise UnicodeDecodeError.  reload(sys) is required
# because setdefaultencoding() is deleted from the module at interpreter
# startup.  This is a well-known Python 2 anti-pattern; kept for behavior.
reload(sys)
sys.setdefaultencoding("utf-8")

try:
    from bitcoinrpc.authproxy import AuthServiceProxy
except ImportError as exc:
    sys.stderr.write("Error: install python-bitcoinrpc (https://github.com/jgarzik/python-bitcoinrpc)\n")
    exit(-1)
 +
### options parsing

from optparse import OptionParser
parser = OptionParser("usage: %prog [options] <page_url> <username>")
parser.add_option("-s", "--serverUrl",
                  action="store", dest="serverUrl", default="http://user:pwd@127.0.0.1:28332",
                  help="connect to specified twisterd server URL")
parser.add_option("-p", "--proxyUrl",
                  action="store", dest="proxyUrl", default="",
                  help="proxyUrl to use")
parser.add_option("-d", action="store_true", dest="dryRun",
                  help="dry-run, just report posts")

(options, args) = parser.parse_args()
if len(args) != 2:
    parser.error("incorrect number of arguments")

pageUrl = args[0]   # html page to scrape for posts
username = args[1]  # twister user to post as
 +
 +### connect to twisterd
 +
 +twister = AuthServiceProxy(options.serverUrl)
 +lastK = -1
 +lastUserPost = twister.getposts(1,​ [{"​username":​username}])
 +for i in range(len(lastUserPost)):​
 +    if lastUserPost[i]["​userpost"​]["​n"​] == username:
 +        lastK = int(lastUserPost[i]["​userpost"​]["​k"​])
 +        break
 +print username, "​lastK:",​ lastK
 +
### load db from previous run

dbFileName = username + ".pickle"

class MyDb:
    """Pickled sync state: timestamp of the newest item already posted."""
    lastDatatime = 0

# Broad on purpose (missing/corrupt cache -> start fresh), but not bare:
# a bare except would also swallow KeyboardInterrupt/SystemExit.
try:
    db = cPickle.load(open(dbFileName))
except Exception:
    db = MyDb()
 +
### setup proxy

# Route both http and https through the optional proxy (e.g. tor/privoxy).
if options.proxyUrl:  # truthiness instead of len() check
    proxy = urllib2.ProxyHandler({'http': options.proxyUrl, 'https': options.proxyUrl})
    opener = urllib2.build_opener(proxy)
    urllib2.install_opener(opener)
 +
### download html content

# Pretend to be a desktop browser; some sites refuse urllib2's default UA.
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.73.11 (KHTML, like Gecko) Version/7.0.1 Safari/537.73.11'
headers = { 'User-Agent' : user_agent }
req = urllib2.Request(pageUrl, headers=headers)
response = urllib2.urlopen(req)
html = response.read()
pq = PyQuery(html.decode('utf8'))
 +
 +### parse html
 +
 +items = pq("​.content"​)
 +for i in xrange(len(items)-1,​0,​-1):​
 +    item = items.eq(i)
 +    datatime = int(item.find("​[data-time]"​).attr("​data-time"​))
 +    if datatime > db.lastDatatime :
 +        db.lastDatatime = datatime
 +        p = item.find("​p"​)
 +        ptext = p.text()
 +        ptext = ptext.replace("://​ ","://"​).replace("#​ ","#"​).replace("​@ ","​@"​)
 +        print "​newpostmsg",​ username, lastK+1, ptext
 +        if not options.dryRun:​
 +        try:
 +            twister.newpostmsg(username,​ lastK+1, ptext)
 +        except:
 +            pass
 +        lastK = lastK+1
 +
 +if not options.dryRun:​
 +    cPickle.dump(db,​open(dbFileName,"​w"​))
</file>
zh/开发/脚本示例/python.txt · 最后更改: 2014/09/28 02:26 由 xiaolan