A script to grab 涨工资's articles

2009-11-22 19:37

"涨工资" (literally "pay raise") is a nickname of Zhang Jiawei (张佳玮).

Zhang Jiawei is a well-known writer on Hoopchina, one of the larger basketball forums in China.

I have been a fan of his writing since 2006, and had long wanted to write a script to pull all of his posts off Hoopchina so I could read them on my phone.

I didn't bother with much cleanup, so there are probably quite a few bugs in here. Since all I ask is that it more or less works, I'm not going to keep fiddling with it...

#!/usr/bin/env python
#coding=utf-8
"""
Author:         Xia Kai <xiaket@gmail.com>
Filename:       get_zjw.py
Type:           Utility
Last modified: 2009-11-19 11:42

Description:
This script retrieves all posts by Zhang Jiawei, a famous
author on a basketball forum in China.

Salute to Zhang Jiawei! Thank you for all those wonderful posts.
"""
import sys
from math import ceil
from os import access, F_OK
from re import compile, sub
from string import Template
from urllib import urlopen


BASE_URL = "http://my.hoopchina.com/zhangjiawei/blog"
CACHE_FILENAME = Template('${post_number}.cache')


def get_post_number():
    """
    Get the number of posts by ZJW.
    """
    print "Retrieving POST Number"
    home_page = urlopen(BASE_URL)
    lines = home_page.readlines()
    for line in lines:
        line = line.decode("GB2312", "ignore").encode("UTF-8")
        if line.startswith('<div class="page">'):
            # The pager reads "共N篇日志"; "共" occupies three bytes in
            # UTF-8, so the byte offset + 3 lands right on the number.
            start = line.find("共") + 3
            end = line.find("篇日志")
            return int(line[start:end])
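
# For reference, the pager line that get_post_number() looks for is assumed
# to resemble the following; only the "共...篇日志" part matters to the parser:
#   <div class="page">共123篇日志 ...</div>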


def get_post_list_from_url(url):
    """
    The post list spans several pages; for the page given as url,
    collect the link, title and date of every post on it.
    """
    print "Generating POST list from %s." % url
    list_file = urlopen(url)
    lines = list_file.readlines()
    is_critical = False
    post_list = []
    for line in lines:
        line = line.decode("gb2312", "ignore").encode("utf8").lstrip()
        if is_critical:
            # This line contains critical information about the post.
            post_dict = {}
            link_start = line.index('href="') + 6
            link_end = line.index('"', link_start)
            title_start = line.index('title="') + 7
            title_end = line.index('"', title_start)
            date_start = line.index('</a>', title_end) + 4
            date_end = line.index("</p>", date_start)
            link = line[link_start:link_end]
            title = line[title_start:title_end]
            date = line[date_start:date_end]
            # Swap ASCII '/' for the fullwidth '/' so the title is
            # safe to use in a filename later.
            post_dict["title"] = title.replace('/', "/")
            post_dict["link"] = link
            post_dict["date"] = date
            is_critical = False
            post_list.append(post_dict)
        elif line.startswith('<div class="log_con">'):
            is_critical = True
    return post_list
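
# For reference, get_post_list_from_url() assumes each entry on a list page
# looks roughly like this (a guess at the markup, not a saved sample):
#   <div class="log_con">
#   <a href="http://my.hoopchina.com/zhangjiawei/..." title="某篇日志">某篇日志</a>2009-11-19</p>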


def caching_post(post_number, POSTS):
    """
    Write the list of post dictionaries POSTS to a cache file.
    """
    filename = CACHE_FILENAME.substitute(post_number=post_number)
    file = open(filename, 'w')
    print "writing cache file."
    for post_dict in POSTS:
        # "$$$" serves as a field separator that is unlikely to show
        # up in a title, date or link.
        file.write("%s$$$%s$$$%s\n" % (
            post_dict['title'],
            post_dict['date'],
            post_dict['link'],
        ))
    file.close()


def reading_cache(post_number):
    """
    Try to read the POSTS cache from file. If the cache exists,
    return 0 and the POSTS; otherwise return 1 and an empty list.
    """
    POSTS = []
    filename = CACHE_FILENAME.substitute(post_number=post_number)
    if access(filename, F_OK):
        # Cache file exists. Read it.
        print "Reading cached POSTS list."
        file = open(filename, 'r')
        lines = file.readlines()
        file.close()
        for line in lines:
            # Strip the trailing newline so the link stays usable.
            line_list = line.rstrip("\r\n").split("$$$")
            post_dict = {}
            post_dict["title"] = line_list[0]
            post_dict["date"] = line_list[1]
            post_dict["link"] = line_list[2]
            POSTS.append(post_dict)
        return 0, POSTS
    else:
        return 1, POSTS
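
# A cache line written by caching_post() and parsed back here looks like
# this (made-up values, for illustration only):
#   某篇日志$$$2009-11-19$$$http://my.hoopchina.com/zhangjiawei/...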


def write_post(url, file):
    """
    Extract the content of a Hoopchina post page and write it to file.
    """
    home_page = urlopen(url)
    lines = home_page.readlines()
    content_start = content_end = None
    for index, line in enumerate(lines):
        line = line.decode("GB2312", "ignore").encode("UTF-8").strip()
        if line.startswith('<div class="title">'):
            content_start = index
        elif line.find('点此关注他/她的动态') != -1:
            content_end = index
    if content_start is None or content_end is None:
        # Page layout not recognized; report failure to the caller.
        return 1
    html_tag = compile(r'<[^>]+>')
    for line in lines[content_start:content_end]:
        line = line.decode("GB2312", "ignore").encode("UTF-8").strip()
        # Turn the various <br> flavours into real line breaks.
        line = line.replace("<BR>", "\r\n").replace("</BR>", "\r\n")
        line = line.replace("<br />", "\r\n").replace("<br>", "\r\n")
        # Drop &nbsp; entities and stray closing divs.
        line = line.replace("&nbsp;", "").replace("</div>", "")
        # Strip any remaining HTML tags.
        line = sub(html_tag, "", line)
        file.write(line + "\r\n")
    return 0
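
# For example, sub(compile(r'<[^>]+>'), "", '<p>正文<a href="#">链接</a></p>')
# returns '正文链接': everything between angle brackets is dropped.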


def main():
    """
    Grab all posts by Zhang Jiawei and save them into individual files.
    """
    post_number = get_post_number()
    print "Would generate %s posts." % post_number
    status, POSTS = reading_cache(post_number)
    if status == 1:
        # Ten posts per list page, so round up.
        post_pages = int(ceil(post_number / 10.0))
        for index in range(1, post_pages + 1):
            url = BASE_URL + "-%s" % index
            post_list = get_post_list_from_url(url)
            POSTS += post_list
        # Write POSTS to a file so it may be reused.
        caching_post(post_number, POSTS)
        print "Done getting post list."
    else:
        print "Using cached post list."

    for post_dict in POSTS:
        filename = "%s__%s.txt" % (post_dict['date'], post_dict['title'])
        if access(filename, F_OK):
            print "File %s exists, skipping." % filename
            continue
        file = open(filename, 'w')
        print "Writing post: %s" % filename
        status = write_post(post_dict['link'], file)
        file.close()
        if status != 0:
            # Bail out on the first post we fail to parse.
            sys.exit(1)


if __name__ == "__main__":
main()
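
To use it, just run the script in an empty directory. On the first run it fetches the post count, walks the list pages, and saves the post list into a <post count>.cache file; later runs reuse that cache. Each post is then written out as a date__title.txt text file, and files that already exist are skipped, so an interrupted run can simply be restarted.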