# User:PhiLiP/ZhConversion.py
#
# Wikipedia, the free encyclopedia
# -*- coding: utf-8  -*-
# file: ZhConversion.py

import re
import zipfile
import codecs

def getUnihan(unihan_fname='Unihan.zip'):
	"""Return the raw contents of ``Unihan.txt`` stored in a zip archive.

	:param unihan_fname: path of the zip archive to open.
	:returns: the content of the ``Unihan.txt`` member exactly as
		``ZipFile.read`` yields it, or ``False`` when an ``IOError`` occurs
		while opening or reading the archive.
	"""
	try:
		# The context manager closes the archive even if read() fails;
		# the handle used to be left open.
		with zipfile.ZipFile(unihan_fname, 'r') as unihanzipfile:
			data = unihanzipfile.read('Unihan.txt')
	except IOError:
		print(unihan_fname + ' not found.')
		return False
	# Report the archive that was actually requested, not a fixed name.
	print(unihan_fname + ' found.')
	return data

def getConversionTable(ctable_fname):
	"""Read a conversion table file as UTF-8 text.

	:param ctable_fname: path of the conversion table file.
	:returns: the decoded file content, or ``False`` when the file cannot
		be opened or read (``IOError``). A status line is printed either way.
	"""
	try:
		# ``with`` guarantees the handle is released; it was never closed before.
		with codecs.open(ctable_fname, 'r', 'utf-8') as conversiontable:
			data = conversiontable.read()
	except IOError:
		print(ctable_fname + ' not found.')
		return False
	print(ctable_fname + ' found.')
	return data

def getConversionTableDiff(ctablediff_fname):
	"""Read the optional ``<name>.diff`` companion of a conversion table.

	:param ctablediff_fname: table name; ``'.diff'`` is appended to it to
		form the path that is opened.
	:returns: the UTF-8 decoded diff text, or ``False`` when the file cannot
		be opened or read (``IOError``). Nothing is printed in either case.
	"""
	try:
		# ``with`` guarantees the handle is released; it was never closed before.
		with codecs.open(ctablediff_fname + '.diff', 'r', 'utf-8') as conversiontablediff:
			return conversiontablediff.read()
	except IOError:
		return False

def patchConversionTable(orig, diff, variant):
	"""Apply a unified diff to the text of a conversion table.

	Hunk headers (``@@``) set the current position from the start line of
	the "+" range; context lines and removed lines must match the table
	text at that position, added lines are inserted there. ``---``/``+++``
	file headers and any other lines are ignored.

	:param orig: text of the conversion table; a BOM on the first line is dropped.
	:param diff: text of the unified diff.
	:param variant: table name, used only in the printed messages.
	:returns: the patched text joined with ``'\\n'``, or ``False`` when a
		context/removed line does not match (including when the diff refers
		to a line past the end of the table).
	"""
	print('  ' + variant + ".diff found, try to merge with file " + variant + '...')
	failure = '  ' + variant + ".diff can't merge with file " + variant + '.'
	origlines = orig.splitlines()
	if origlines:
		# An empty table has no first line; indexing it used to raise IndexError.
		origlines[0] = origlines[0].replace(u'\ufeff', '')
	i = 0
	for diffline in diff.splitlines():
		if diffline.startswith('@@'):
			# "@@ -a,b +c,d @@": continue at line c (1-based) of the patched text.
			# A "+0,0" range would give -1 and index from the end, so clamp it.
			i = max(int(diffline.split()[2].split(',')[0]) - 1, 0)
		elif diffline.startswith(' '):
			# Context line: must equal the current table line. Running past
			# the end is a failed merge, not an IndexError.
			if i < len(origlines) and diffline[1:] == origlines[i]:
				i += 1
			else:
				print(failure)
				return False
		elif diffline.startswith('+') and not diffline.startswith('+++'):
			origlines.insert(i, diffline[1:])
			i += 1
		elif diffline.startswith('-') and not diffline.startswith('---'):
			# Removed line: must equal the current table line before dropping it.
			if i < len(origlines) and diffline[1:] == origlines[i]:
				origlines.pop(i)
			else:
				print(failure)
				return False
	return '\n'.join(origlines)

def getDictFromUnihan(variant):
	"""Build a character mapping from one field of ``Unihan.txt``.

	Every line containing ``<TAB>variant<TAB>`` is used: the text before
	the separator and the first whitespace-delimited token after it are
	read as ``U+XXXX`` code points (the two-character prefix is skipped and
	the rest parsed as hexadecimal). Lines whose two sides are equal are
	skipped.

	:param variant: field name to look for, e.g. ``'kSimplifiedVariant'``.
	:returns: dict mapping source character to target character; empty
		when ``getUnihan`` could not provide any data.
	"""
	unihanfile = getUnihan(unihan_fname='Unihan.zip')
	to = {}
	if not unihanfile:
		# getUnihan() signals a missing archive with False (and has already
		# printed a notice); calling .splitlines() on it raised AttributeError.
		return to
	if isinstance(unihanfile, bytes):
		# ZipFile.read() returns bytes; decode so the text operations below
		# also work on Python 3.
		unihanfile = unihanfile.decode('utf-8')
	sept = '\t' + variant + '\t'
	for elem in unihanfile.splitlines():
		left, sep, right = elem.partition(sept)
		if sep != sept:
			continue
		targets = right.split()
		if not targets:
			# Field present but empty: nothing to map.
			continue
		right = targets[0]
		if left != right:
			to[ucs4chr(int(left[2:], 16))] = ucs4chr(int(right[2:], 16))
	return to

def getDictFromConversionTable(to, variant):
	"""Merge the rules of a conversion table file into ``to``.

	The table named ``variant`` is loaded; when a ``variant + '.diff'``
	file exists and applies cleanly, the patched text is saved through
	``saveConversionTable`` and used instead. Rules are the ``left => right``
	lines found inside ``-{ ... }-`` sections: ``*`` is removed from the
	left side, anything after ``//`` and every ``;`` from the right side.

	A rule whose two sides are equal removes an existing entry for that
	key; every other rule sets ``to[left] = right``.

	:param to: dict to update (modified in place).
	:param variant: name of the conversion table file.
	:returns: ``to``; unchanged when the table file could not be read.
	"""
	conversiontable = getConversionTable(ctable_fname = variant)
	if conversiontable is False:
		# Missing table (already reported by getConversionTable): there is
		# nothing to merge. Passing False on used to raise a TypeError.
		return to
	conversiontablediff = getConversionTableDiff(ctablediff_fname = variant)
	if conversiontablediff:
		conversiontb = patchConversionTable(conversiontable, conversiontablediff, variant)
		if conversiontb:
			saveConversionTable(variant, conversiontb)
			conversiontable = conversiontb
	# Raw string: same pattern, without invalid string escape sequences.
	p = re.compile(r'-\{([\s\S]*?)\}-')
	elems = []
	for conversions in p.findall(conversiontable):
		elems += conversions.splitlines()
	for elem in elems:
		left, sep, right = elem.partition('=>')
		if sep != '=>':
			continue
		left = left.replace('*','').strip()
		right = right.partition('//')[0].replace(';','').strip()
		if left == right and left in to:
			# Identity rule for a known key: drop the existing mapping.
			to.pop(left)
		else:
			to[left] = right
	return to

def toHansDict():
	"""Return the mapping built from Unihan's ``kSimplifiedVariant`` field,
	updated with the rules of the ``Zh-hans`` conversion table."""
	return getDictFromConversionTable(
		getDictFromUnihan('kSimplifiedVariant'),
		'Zh-hans',
	)

def toHantDict():
	"""Return the mapping built from Unihan's ``kTraditionalVariant`` field,
	updated with the rules of the ``Zh-hant`` conversion table."""
	return getDictFromConversionTable(
		getDictFromUnihan('kTraditionalVariant'),
		'Zh-hant',
	)

def toOtherDict(variant):
	"""Return the mapping defined solely by the conversion table ``variant``
	(starting from an empty dict, with no Unihan data)."""
	return getDictFromConversionTable({}, variant)

def getConversionCode(to):
	"""Render a mapping as the body of a PHP ``array(...)`` literal.

	:param to: dict of source string -> replacement string.
	:returns: one ``"key" => "value",`` line (newline-terminated) per entry,
		ordered by key; an empty string for an empty dict.
	"""
	def quote(text):
		# Inside PHP double quotes, backslash, double quote and "$" are
		# special; escape them (backslash first) so every entry stays a
		# plain literal. Entries without these characters are unchanged.
		text = text.replace('\\', '\\\\').replace('"', '\\"').replace('$', '\\$')
		return '"' + text + '"'

	# Build with join instead of repeated += on a growing string.
	return ''.join(
		quote(left) + ' => ' + quote(to[left]) + ',\n'
		for left in sorted(to)
	)

def saveFile(toHant, toHans, toTW, toHK, toCN, toSG):
	"""Write the six conversion tables to ``ZhConversion.php`` (UTF-8).

	The file consists of a fixed header comment followed by the arrays
	``$zh2Hant``, ``$zh2Hans``, ``$zh2TW``, ``$zh2HK``, ``$zh2CN`` and
	``$zh2SG``, each rendered with ``getConversionCode`` and separated by a
	blank line.

	:param toHant: mapping written as ``$zh2Hant``.
	:param toHans: mapping written as ``$zh2Hans``.
	:param toTW: mapping written as ``$zh2TW``.
	:param toHK: mapping written as ``$zh2HK``.
	:param toCN: mapping written as ``$zh2CN``.
	:param toSG: mapping written as ``$zh2SG``.
	"""
	header = (u'<?php\n/**\n * Simplified / Traditional Chinese conversion tables\n *'
		+ u'\n * Automatically generated using code and data in includes/zhtable/'
		+ u'\n * Do not modify directly!\n */\n\n')
	tables = (
		(u'$zh2Hant', toHant),
		(u'$zh2Hans', toHans),
		(u'$zh2TW', toTW),
		(u'$zh2HK', toHK),
		(u'$zh2CN', toCN),
		(u'$zh2SG', toSG),
	)
	arrays = [
		name + u' = array(\n' + getConversionCode(table) + u');'
		for name, table in tables
	]
	CString = header + u'\n\n'.join(arrays)
	# Build the whole content before opening the file, so a failure while
	# rendering does not truncate an existing ZhConversion.php, and let the
	# context manager close the handle even if the write fails.
	with codecs.open('ZhConversion.php', 'w', 'utf-8') as zhConversion:
		zhConversion.write(CString)
	# Announce success only once the file has been closed.
	print ('ZhConversion.php created / updated successfully.')

def saveConversionTable(variant, conversiontable):
	"""Write a conversion table to the file ``variant + '_new'`` as UTF-8.

	:param variant: table name; ``'_new'`` is appended to form the path.
	:param conversiontable: text to write.
	"""
	# The context manager closes the handle even when the write fails.
	with codecs.open(variant + '_new', 'w', 'utf-8') as conversiontablefile:
		conversiontablefile.write(conversiontable)
	# Report only once the file has been closed.
	print ('  ' + variant + '_new created.')

def ucs4chr(codepoint):
	"""Return the string for a Unicode code point.

	When the interpreter's character function rejects a code point in the
	range 0x10000-0x10FFFF with ``ValueError``, the UTF-16 surrogate pair
	for it is returned instead.

	:param codepoint: integer code point.
	:returns: a string of one character, or of two surrogates.
	:raises ValueError: for code points outside the Unicode range.
	"""
	try:
		to_char = unichr
	except NameError:
		# Python 3 has no unichr(); chr() plays the same role there.
		to_char = chr
	try:
		return to_char(codepoint)
	except ValueError:
		# Only supplementary-plane code points have a surrogate encoding;
		# anything else used to come back as a bogus pair.
		if not 0x10000 <= codepoint <= 0x10FFFF:
			raise
		hi, lo = divmod(codepoint - 0x10000, 0x400)
		return to_char(0xd800 + hi) + to_char(0xdc00 + lo)

def ucs4ord(str):
	"""Return the code point of a character given as one code unit or as a
	UTF-16 surrogate pair.

	:param str: string of length 1, or of length 2 holding a high surrogate
		followed by a low surrogate. (The name shadows the builtin; it is
		kept so keyword callers continue to work.)
	:returns: the integer code point.
	:raises TypeError: for any other input.
	"""
	if len(str) == 1:
		return ord(str)
	if len(str) == 2:
		hi, lo = ord(str[0]) - 0xd800, ord(str[1]) - 0xdc00
		# Both halves must be real surrogates, otherwise this is not one character.
		if 0 <= hi < 0x400 and 0 <= lo < 0x400:
			# The low surrogate carries the bottom 10 bits; it used to be dropped.
			return 0x10000 + hi * 0x400 + lo
	raise TypeError("ucs4ord() expected a valid ucs4 character")

# Script body: build every table in the same order as the output arrays,
# then write ZhConversion.php.
toHant = toHantDict()
toHans = toHansDict()
toTW, toHK, toCN, toSG = map(toOtherDict, ('Zh-tw', 'Zh-hk', 'Zh-cn', 'Zh-sg'))
saveFile(toHant, toHans, toTW, toHK, toCN, toSG)