User:PhiLiP/bird.py

#! -*- coding: utf-8 -*-
import socks, socket
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 7070)
socket.socket = socks.socksocket
import urllib, urllib2, cookielib, re, json, subprocess, datetime, cgi, tempfile
import os

# Taxonomy shared by every article produced in one run, one rank per line as
# ( Latin name, Chinese name ).  do_translate() writes these into the taxobox
# fields of the same names, and gen() reads the English Wikipedia category
# called "Category:<genus_la>".  Edit these values to process another genus.
regnum_la, regnum_zh = u'Animalia', u'动物界'

phylum_la, phylum_zh = u'Chordata', u'脊索动物门'

classis_la, classis_zh = u'Aves', u'鸟纲'

ordo_la, ordo_zh = u'Passeriformes', u'雀形目'

familia_la, familia_zh = u'Estrildidae', u'梅花雀科'

genus_la, genus_zh = u'Lagonosticta', u'火雀属'

def global_extent( m ):
	"""Read capture group 1 of the regex match ``m`` and discard it.

	The function never returns a value, so the result is always None.
	Nothing in the visible part of this file calls it.
	NOTE(review): this looks like an unfinished re.sub callback for the
	"global extent of occurrence" sentence -- confirm before relying on it.

	m -- a match object whose pattern has at least one capture group
	"""
	m.group( 1 )
	return None

# Ordered table of ( regex pattern, replacement ) rules.  do_translate() applies
# them one after another to the part of the article that precedes the
# references heading.  A rule with two elements is compiled case-insensitively
# (re.I); a rule with a third element uses that element as the regex flags.
# Because the rules run in sequence, a longer name must come before any shorter
# name it contains (e.g. 'Papua New Guinea' before 'New Guinea', 'Guinea-Bissau'
# before 'Guinea', 'Nigeria' before 'Niger').
replacements = (
	# Boilerplate sentences of the auto-generated English stubs.
	( u'<!-- This article was auto-generated by \[\[User:Polbot\]\]. -->', u'' ),
	# NOTE(review): the family name in this pattern is hard-coded to
	# "Campephagidae" while familia_la above is "Estrildidae", so the rule
	# cannot match an article that names the configured family -- confirm
	# whether the pattern should follow familia_la.
	( u'\s*is a species of [^\w]*bird[^\w]* in the [^\w]*Campephagidae[^\w]* family.',
	  u'是[[%s]][[%s]]的一种，' % ( familia_zh, genus_zh ) ),
	( u'It has an estimated global extent of occurrence of ([\d,]+) km².',
	  u'其全球活动范围有\\1平方千米。' ),
	( u'\s*It is found in ', u'分布于' ),
	# Country and region names.
	( u'São Tomé and Príncipe', u'圣多美和普林西比' ),
	( u'New Caledonia', u'新喀里多尼亚' ),
	( u'Solomon Islands', u'所罗门群岛' ),
	( u'Vanuatu', u'瓦努阿图' ),
	( u'Norfolk Island', u'诺福克岛' ),
	( u'Brunei', u'文莱' ),
	( u'India', u'印度' ),
	( u'Indonesia', u'印度尼西亚' ),
	( u'Malaysia', u'马来西亚' ),
	( u'Philippines', u'菲律宾' ),
	( u'Singapore', u'新加坡' ),
	( u'Thailand', u'泰国' ),
	( u'Australia', u'澳大利亚' ),
	( u'Papua New Guinea', u'巴布亚新几内亚' ),
	( u'New Guinea', u'新几内亚' ),
	( u'Bismarck Archipelago', u'俾斯麦群岛' ),
	( u'Samoa', u'萨摩亚' ),
	( u'East Timor', u'东帝汶' ),
	( u'Angola', u'安哥拉' ),
	( u'Myanmar', u'缅甸' ),
	( u'Benin', u'贝宁' ),
	( u'Cameroon', u'喀麦隆' ),
	( u'Central African Republic', u'中非共和国' ),
	( u'Democratic Republic of the Congo', u'刚果民主共和国' ),
	( u'Republic of(?: the)? Congo', u'刚果共和国' ),
	( u'Côte d\'Ivoire', u'科特迪瓦' ),
	( u'Ivory Coast', u'科特迪瓦' ),
	( u'Equatorial Guinea', u'赤道几内亚' ),
	( u'Botswana', u'博茨瓦纳' ),
	( u'Burundi', u'布隆迪' ),
	( u'Ethiopia', u'埃塞俄比亚' ),
	( u'Malawi', u'马拉维' ),
	( u'Burkina Faso', u'布基纳法索' ),
	( u'Chad', u'乍得' ),
	( u'Eritrea', u'厄立特里亚' ),
	( u'Gambia', u'冈比亚' ),
	( u'Mozambique', u'莫桑比克' ),
	( u'Namibia', u'纳米比亚' ),
	( u'Rwanda', u'卢旺达' ),
	( u'Somalia', u'索马里' ),
	( u'Japan', u'日本' ),
	( u'South Africa', u'南非' ),
	( u'Swaziland', u'斯威士兰' ),
	( u'Zimbabwe', u'津巴布韦' ),
	( u'Gabon', u'加蓬' ),
	( u'Ghana', u'加纳' ),
	( u'Guinea-Bissau', u'几内亚比绍' ),
	( u'Guinea', u'几内亚' ),
	( u'Kenya', u'肯尼亚' ),
	( u'Liberia', u'利比里亚' ),
	( u'Mali', u'马里' ),
	# The case-insensitive 'Mali' rule above also hits the "mali" inside
	# "Animalia"; this rule puts the Latin word back.
	( u'Ani马里a', u'Animalia' ),
	( u'Nigeria', u'尼日利亚' ),
	( u'Sierra Leone', u'塞拉利昂' ),
	( u'Sudan', u'苏丹' ),
	( u'Tanzania', u'坦桑尼亚' ),
	( u'Togo', u'多哥' ),
	( u'Uganda', u'乌干达' ),
	( u'Zambia', u'赞比亚' ),
	( u'Mauritania', u'毛里塔尼亚' ),
	( u'Niger', u'尼日尔' ),
	( u'Senegal', u'塞内加尔' ),
	#( u'', u'' ),
	# Habitat vocabulary; the optional [[ ]] around a word is either kept
	# (captured groups) or replaced by a fixed link.
	( u'coastal hinterland', u'沿海腹地' ),
	( u'hinterland', u'腹地' ),
	( u'subspecies', u'亚种' ),
	( u'extinct', u'绝灭' ),
	( u'Its natural (?:\[\[)?habitats?(?:\]\])?s? (?:are|is)', u'其自然[[栖息地]]为' ),
	( u' ?subtropical or tropical', u'亚热带或热带的' ),
	( u' ?moist', u'湿润' ),
	( u' ?dry', u'干燥' ),
	( u' ?(?:\[\[)?lowland(?:\]\])?', u'低地' ),
	( u' ?(?:\[\[)?mangrove (?:\[\[)?forests?(?:\]\])?s?', u'[[红树林]]' ),
	( u' ?((?:\[\[)?)forests?((?:\]\])?)s?', u'\\1森林\\2' ),
	( u' ?((?:\[\[)?)montanes?((?:\]\])?)s?', u'\\1山地\\2' ),
	( u' ?((?:\[\[)?)plantations?((?:\]\])?)s?', u'\\1人造林\\2' ),
	( u' ?((?:\[\[)?)swamps?((?:\]\])?)s?', u'\\1沼泽\\2' ),
	( u' ?(?:\[\[)?shrublands?(?:\]\])?s?', u'{{link-en|疏灌丛|shrubland}}' ),
	( u' ?((?:\[\[)?)savannas?((?:\]\])?)s?', u'\\1疏林草原\\2' ),
	# Piped wikilinks to people's names, rewritten as Chinese links or
	# {{link-en}} templates.
	( u'\[\[John Gould\|Gould\]\]', u'[[约翰·古尔德|Gould]]' ),
	( u'\[\[Coenraad Jacob Temminck\|Temminck\]\]', u'[[康拉德·雅各·特明克|Temminck]]' ),
	( u'\[\[Raymond Robert Forster\|Forster\]\]', u'{{link-en|雷蒙德·罗伯特·福斯特|Raymond Robert Forster|Forster}}' ),
	( u'\[\[Emile Oustalet\|Oustalet\]\]', u'{{link-en|埃米尔·乌斯塔莱|Emile Oustalet|Oustalet}}' ),
	( u'\[\[Louis Jean Pierre Vieillot\|Vieillot\]\]',
	  u'{{link-en|路易·让·皮埃尔·维埃约|Louis Jean Pierre Vieillot|Vieillot}}' ),
	( u'\[\[John Latham \(ornithologist\)\|Latham\]\]',
	  u'{{link-en|约翰·莱瑟姆|John Latham (ornithologist)|Latham}}' ),
	( u'\[\[John Cassin\|Cassin\]\]', u'{{link-en|约翰·卡辛|John Cassin|Cassin}}' ),
	( u'\[\[Hugh Edwin Strickland\|Strickland\]\]',
	  u'{{link-en|休·埃德温·斯特里克兰|Hugh Edwin Strickland|Strickland}}' ),
	( u'\[\[Louis Fraser\|Fraser\]\]',
	  u'{{link-en|路易斯·弗雷泽|Louis Fraser|Fraser}}' ),
	( u'\[\[Gustav Hartlaub\|Hartlaub\]\]',
	  u'{{link-en|古斯塔夫·哈特劳布|Gustav Hartlaub|Hartlaub}}' ),
	( u'\[\[Ernst Hartert\|Hartert\]\]',
	  u'{{link-en|恩斯特·哈特尔特|Ernst Hartert|Hartert}}' ),
	( u'\[\[Boyd Alexander\|Alexander\]\]',
	  u'{{link-en|博伊德·亚历山大|Boyd Alexander|Alexander}}' ),
	( u'\[\[Theodor von Heuglin\|Heuglin\]\]',
	  u'{{link-en|特奥多尔·冯·霍伊格林|Theodor von Heuglin|Heuglin}}' ),
	( u'Carolus Linnaeus', u'卡尔·林奈' ),
	# Conservation-status sentences.
	( u'It is threatened by \[\[habitat loss\]\].', u'该物种受栖息地减少威胁。' ),
	( u'The status of the species is evaluated as \[\[Least Concern\]\].',
	  u'该物种的保护状况被评为[[无危]]。' ),
	# Punctuation between consecutive wikilinks, then any remaining " and".
	# These run last so that the names above are already translated.
	( u'\]\], (?:the )?\[\[', u']]、[[' ),
	( u'\]\],? (?:and|&) \[\[', u']]和[[' ),
	( u'\]\]\. ', u']]。' ),
	( u' and ?', u'以及' ),
)

def get_chinese( text ):
	"""Search www.cnbird.org.cn for ``text`` and return the first hit.

	text -- unicode search term; it is sent UTF-8 encoded as the ``txt``
	        query parameter (do_translate() passes the scientific name).

	Returns the content of the second cell of the first
	``<tr class=tr1>`` row of the result page as unicode (presumably the
	Chinese name column -- verify against the site), or '' when the page
	contains no such row.
	"""
	params = {
		'txt': text.encode( 'utf-8' ),
		's': 'Search' }
	fp = urllib.urlopen( 'http://www.cnbird.org.cn/world/result.asp?' + urllib.urlencode( params ) )
	try:
		# The response body is decoded as GBK.
		data = fp.read().decode( 'gbk' )
	finally:
		# Release the connection even when reading or decoding fails.
		fp.close()
	found = re.search( u'<tr class=tr1><td>.+?</td><td>(.+?)</td><td>', data )
	if found:
		return found.group( 1 )
	return ''

def exists( title ):
	"""Tell whether a page named ``title`` exists on the Chinese Wikipedia.

	title -- unicode page title; sent UTF-8 encoded to the zh.wikipedia
	         ``action=query`` API.

	Returns False when the API answer contains a page keyed '-1' (which
	this script takes to mean "title not found"), True otherwise.
	Raises KeyError if the answer has no ``query``/``pages`` section.
	"""
	params = {
		'action': 'query',
		'titles': title.encode( 'U8' ),
		'format': 'json',
	}
	fp = urllib.urlopen( 'http://zh.wikipedia.org/w/api.php?' + urllib.urlencode( params ) )
	try:
		data = json.load( fp )
	finally:
		# Release the connection even when the answer is not valid JSON.
		fp.close()
	return '-1' not in data['query']['pages']

# Cookie-aware opener reused by every request update_iucnid() makes, so that
# cookies set by one answer are sent back with the next request.
cj = cookielib.CookieJar()
opener = urllib2.build_opener( urllib2.HTTPCookieProcessor( cj ) )
def update_iucnid( text ):
	"""Search the IUCN Red List site for ``text`` and return a numeric id.

	text -- unicode name to search for (build_iucn() passes the scientific
	        name); sent UTF-8 encoded in the request body.

	Returns, as a string of digits, the number found in the ``href`` of
	the first link whose ``class="sciname"`` text starts with ``text``,
	or '' when the result page has no such link.
	"""
	params = {
		'mode': '',
		'text': text.encode( 'utf-8' ),
		'x': 15,
		'y': 5
	}
	fp = opener.open( 'http://www.iucnredlist.org/apps/redlist/search/external', urllib.urlencode( params ) )
	try:
		data = fp.read().decode( 'utf-8' )
	finally:
		# Release the connection even when reading or decoding fails.
		fp.close()
	# re.escape keeps the name literal: characters such as "." or "(" in it
	# must not be interpreted as regex syntax.
	found = re.search( u'<a href=".+?(\d+).+?class="sciname">%s' % re.escape( text ), data )
	if found:
		return found.group( 1 )
	return ''

def build_iucn( binomial ):
	"""Build the {{IUCN2010}} citation template for one species.

	binomial -- scientific name; used both for the lookup done by
	            update_iucnid() and as the template's ``title``.

	Returns the filled-in template with today's UTC date (YYYY-MM-DD) as
	the ``downloaded`` value, or '' when update_iucnid() finds no id.
	"""
	iucnid = update_iucnid( binomial )
	if not iucnid:
		return ''
	# ( year, month, day ) of the current UTC time.
	today = '%4d-%02d-%02d' % datetime.datetime.utcnow().timetuple()[:3]
	template = u'{{IUCN2010 | assessors = BirdLife International | year = 2009 | id = %s | title = ' \
		+ '%s | version = 2010.3 | downloaded = %s }}'
	return template % ( iucnid, binomial, today )

def interwikis( m, english ):
	"""Rebuild a block of interwiki links, adding a link to the English page.

	m       -- regex match whose whole text (group 0) is the run of
	           interwiki links, one or more per line
	english -- title of the English article

	Returns two newlines followed by the non-blank, stripped lines of the
	match plus ``[[en:<english>]]``, sorted and joined by single newlines.
	"""
	stripped = [ line.strip() for line in m.group(0).split('\n') ]
	links = [ link for link in stripped if link ]
	links.append( u'[[en:%s]]' % english )
	return u'\n\n' + u'\n'.join( sorted( links ) )

def gen():
	"""Yield ( title, wikitext ) for the members of the genus category.

	Queries the English Wikipedia API for the pages in
	"Category:<genus_la>", ten per request, together with the content of
	the revision the API returns for each page.  While an answer carries a
	``query-continue`` section, its ``gcmcontinue`` value is added to the
	next request; the generator ends when no such section is present.

	Raises KeyError if a returned page has no ``revisions`` entry or the
	``query-continue`` section has no ``categorymembers`` part.
	"""
	base = {
		'action': 'query',
		'prop': 'revisions',
		'generator': 'categorymembers',
		'gcmtitle': 'Category:%s' % genus_la,
		'rvprop': 'content',
		'format': 'json',
		'gcmlimit': 10 }

	query = dict( base )

	while True:
		fp = urllib.urlopen( 'http://en.wikipedia.org/w/api.php?' + urllib.urlencode( query ) )
		try:
			data = json.load( fp )
		finally:
			# Release the connection before handing pages to the caller,
			# which may keep this generator suspended for a long time.
			fp.close()
		if 'query' in data:
			for page in data['query']['pages'].itervalues():
				yield ( page['title'], page['revisions'][0]['*'] )
		if 'query-continue' not in data:
			break
		# Start again from the base parameters so that only the newest
		# continuation value is sent.
		query = dict( base )
		query['gcmcontinue'] = data['query-continue']['categorymembers']['gcmcontinue']

def do_translate( title, text ):
	"""Convert the wikitext of one English bird article into a Chinese draft.

	title -- title of the English article; used for the [[en:...]] link
	text  -- unicode wikitext of the English article

	Returns ( chinese, binomial, text ): the Chinese page title, the
	scientific name read from the taxobox ``binomial`` field and the
	converted wikitext.  Returns ( '', '', '' ) when the article has no
	``binomial`` field or when a page with the Chinese title already
	exists on the Chinese Wikipedia.

	Side effects: looks names up through get_chinese(), exists() and
	build_iucn(); asks on the console (GBK encoded) for the Chinese name
	when get_chinese() finds none; prints progress messages.
	"""
	found = re.findall( u'\|\s*binomial\s*=\s*\'\'(.+?)\'\'', text )
	if not found:
		return ( '', '', '' )
	binomial = found[0]

	chinese = get_chinese( binomial )
	while not chinese:
		chinese = raw_input( ( u'请输入%s的中文名称：' % binomial ).encode( 'GBK' ) )
		chinese = chinese.decode( 'GBK' )

	if exists( chinese ):
		print( ( u'条目[[%s]]已存在，自动跳过' % chinese ).encode( 'GBK' ) )
		return ( '', '', '' )

	# Put the Chinese name into the taxobox and the lead sentence.
	# "\g<1>" is used instead of "\1" so that a name starting with a digit
	# cannot be read as part of the group number.
	text = re.sub( u'(\|\s*name\s*=\s*)[^\|\n\r]*', u'\\g<1>%s' % chinese, text )
	text = re.sub( u'(\|\s*species\s*=\s*\'\'\')', u'\\g<1>%s ' % chinese, text )
	text = re.sub( u'(The \'\'\').+?(\'\'\')', u'\\g<1>%s\\2' % chinese, text )

	# Overwrite the higher taxobox ranks with "[[Chinese]] Latin".
	ranks = (
		( u'regnum', regnum_zh, regnum_la ),
		( u'phylum', phylum_zh, phylum_la ),
		( u'classis', classis_zh, classis_la ),
		( u'ordo', ordo_zh, ordo_la ),
		( u'familia', familia_zh, familia_la ),
	)
	for ( field, zh, la ) in ranks:
		text = re.sub( u'(\|\s*%s\s*=\s*)[^\|\n\r]*' % field, u'\\1[[%s]] %s' % ( zh, la ), text )
	# The genus is the only rank whose Latin name is set in italics.
	text = re.sub( u'(\|\s*genus\s*=\s*)[^\|\n\r]*', u'\\1[[%s]] \'\'%s\'\'' % ( genus_zh, genus_la ), text )

	# Lead sentence: scientific name in bold, then scientific name in
	# italics (the latter also gets the "is a kind of <family><genus>" text).
	text = re.sub( u'The \'\'\'(.*?)\'\'\' \(\'\'\'(.*?)\'\'\'\)', u'\'\'\'\\1\'\'\'（学名：{{lang|la|\\2}}）', text )
	text = re.sub( u'The \'\'\'(.*?)\'\'\' \((\'\'.*?\'\')\)',
				   u'\'\'\'\\1\'\'\'（学名：{{lang|la|\\2}}）是[[%s]][[%s]]的一种，' % ( familia_zh, genus_zh ), text )

	# Section headings.
	text = re.compile( u'==\s*(?:Foot)?Notes?\s*==', re.I ).sub( u'== 脚注 ==', text )
	text = re.compile( u'==\s*References?\s*==', re.I ).sub( u'\n== 参考文献 ==', text )
	text = re.compile( u'==\s*External links?\s*==', re.I ).sub( u'== 外部链接 ==', text )

	# Drop stub templates and "Birds of ..." categories, rename the genus
	# category, then re-sort the interwiki links and add the English one.
	text = re.compile( u'\s*\{\{[^\}]+stub\}\}\s*', re.I ).sub( '', text )
	text = re.compile( u'\[\[Category:%s\]\]' % genus_la, re.I ).sub( u'[[Category:%s]]' % genus_zh, text )
	text = re.compile( u'\[\[Category:Birds of .+?\]\]', re.I ).sub( u'', text )
	text = re.sub( u'(?:\s*\[\[[a-z]+:.+?\]\])+', lambda m: interwikis( m, title ), text )

	# parts[0] is the article body, parts[1] (if present) everything from
	# the references section on.
	parts = text.split( u'== 参考文献 ==' )

	if len( parts ) == 2:
		iucn = build_iucn( binomial )
		print( iucn.encode( 'GBK' ) )
		if iucn:
			# Every line mentioning "iucn" is replaced by the new template.
			# A function is used so the template text is inserted literally.
			parts[1] = re.compile( u'((?:^\*)?).*iucn.*', re.I ).sub( lambda m: iucn, parts[1] )

	# Only the article body goes through the translation table.
	for rp in replacements:
		flags = rp[2] if len( rp ) == 3 else re.I
		try:
			parts[0] = re.compile( rp[0], flags ).sub( rp[1], parts[0] )
		except Exception:
			# Show the offending rule, then let the error propagate.
			# repr() is used because the rule is a tuple and its text may
			# not be printable on the console.
			print( repr( rp ) )
			raise
	text = u'== 参考文献 =='.join( parts )

	return ( chinese, binomial, text )

def build_html( title, text ):
	"""Build an HTML page that posts ``text`` to the zh.wikipedia edit form.

	title -- unicode title of the page to edit
	text  -- unicode wikitext to place in the edit box

	Returns the page as a UTF-8 byte string.  It holds a hidden form whose
	action is zh.wikipedia's index.php with the URL-encoded title and
	``action=submit``, a ``wpTextbox1`` textarea containing the HTML-escaped
	text, and a script that submits the form as soon as the page has loaded.
	"""
	title = title.encode( 'utf-8' )
	text = text.encode( 'utf-8' )
	query = urllib.urlencode( { 'title': title } )
	# The metadata/script container is the HTML <head> element; <header> is
	# a different (body-level) element.
	html = '''<html>
	<head>
		<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
		<script language="javascript">
			window.onload = function() {
				document.getElementById( 'wiki' ).submit();
			}
		</script>
	</head>
	<body>
		<form style="display:none;" id="wiki" action="http://zh.wikipedia.org/w/index.php?%s&action=submit" method="post">
			<textarea name="wpTextbox1">%s</textarea>
		</form>
	</body>
</html>''' % ( query, cgi.escape( text ) )
	return html

def main():
	"""Translate every article returned by gen() and open each in Firefox.

	For each article that do_translate() accepts, the auto-submitting page
	from build_html() is written to the file 'tmpfile' in the current
	directory and opened with the ``firefox`` command.  Before every page
	after the first, the user is asked on the console (GBK encoded)
	whether the previous page has been published: 'y' continues, 'n' ends
	the run, anything else repeats the question.
	"""
	page_path = 'tmpfile'
	opened_before = False
	for ( english, oritext ) in gen():

		( chinese, binomial, midtext ) = do_translate( english, oritext )

		# do_translate() returns empty strings for articles to skip.
		if not binomial:
			continue

		html = build_html( chinese, midtext )

		if opened_before:
			while True:
				result = raw_input( '完成发表后请输入y，中止输入n：'.decode( 'U8' ).encode( 'GBK' ) )
				if result.lower() == 'y':
					break
				elif result.lower() == 'n':
					return

		# "with" closes the file even when the write fails.
		with open( page_path, 'wb' ) as tmp:
			tmp.write( html )
		opened_before = True
		os.system( 'firefox %s' % page_path )

if __name__ == '__main__':
	main()