Skip to content

Commit 82c355e

Browse files
committed
提交注释自动添加删除工具
1 parent ad612ad commit 82c355e

File tree

2 files changed

+446
-0
lines changed

2 files changed

+446
-0
lines changed

tools/pgodc_add_commet.py

Lines changed: 374 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,374 @@
1+
#coding=utf-8
2+
#!/usr/bin/python
3+
4+
import sys,os,re
5+
import logging
6+
7+
#sourceDir = r"sgml_cn"
8+
#commentDir = r"sgml_en"
9+
#destDir = r"sgml_out"
10+
# Encoding used when the source/comment SGML files are re-read and when the
# annotated output file is written.
file_encoding = r'UTF-8'

# SGML files (matched by base name) that are skipped instead of annotated.
skip_sgmls = (
    'legal.sgml',
    'bookindex.sgml',
    'errcodes-table.sgml',
    'features-supported.sgml',
    'features-unsupported.sgml',
    'version.sgml',
)

# Marker lines written before and after each inserted copy of the original
# text.  They are runtime strings; do not "correct" their spelling.
COMMENT_NOTE_START = r'<!--==========================orignal english content=========================='
COMMENT_NOTE_END = r'____________________________________________________________________________-->'

# Elements that receive an inserted comment.  Their child elements are also
# left out of the source/comment structure comparison.
CN_TAGS = (
    'title',
    'para',
    'simpara',
    'indexterm',
    'term',
    'titleabbrev',
    'row',
    'bookinfo',
    'remark',
    'programlisting',
    'screen',
    'literallayout',
    'refmeta',
    'refnamediv',
    'indexentry',
    'biblioentry',
    'cmdsynopsis',
    'synopsis',
)

# A start tag: a name followed by any number of quoted attributes.
opentag_pattern = re.compile(r'^<[a-zA-Z][a-zA-Z0-9_]*(\s+[a-zA-Z][a-zA-Z0-9_]*\s*=\s*(?P<qt>[\'"])[^\'"]*(?P=qt))*\s*>')
# An end tag; the name is optional, so the short form "</>" also matches.
closetag_pattern = re.compile(r'^</([a-zA-Z][a-zA-Z0-9_]*)?>')
21+
22+
23+
# Configure logging: DEBUG-level records go to sgmlparse.log (relative path),
# and the file is truncated each time this module is loaded.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(lineno)4d: %(message)s',
    datefmt='%a, %d %b %Y %H:%M:%S',
    filename='sgmlparse.log',
    filemode='w',
)

logging.debug('This is info message')
31+
32+
class Tag():
    """One SGML element found in a file, with its place in the element tree.

    Attributes:
        name: element name.
        start: offset of the element's first character in the file content.
        name_attr: text of the start tag (name plus attributes); defaults to
            ``name`` until a caller overwrites it.
        end: offset just past the element's last character (0 until set).
        child_tags: directly nested ``Tag`` objects, in document order.
        parent: enclosing ``Tag`` or ``None`` for a top-level element.
        line_num: line on which the element was reported (0 until set).
    """

    def __init__(self, name, start=0):
        self.name = name
        # The original always stored 0 here and silently dropped the
        # ``start`` argument; honour what the caller passed.
        self.start = start
        self.name_attr = name
        self.end = 0
        self.child_tags = list()
        self.parent = None
        self.line_num = 0

    def childs(self):
        """Return a shallow copy of the list of direct children."""
        return self.child_tags.copy()

    def to_str(self, skip_cntags_childs=False, with_line_num=False):
        """Render this element and its descendants via ``tag_to_str``."""
        return tag_to_str(self, skip_cntags_childs=skip_cntags_childs, with_line_num=with_line_num)
47+
48+
def tag_to_str(tag, indent=0, skip_cntags_childs=False, with_line_num=False):
    """Render ``tag`` and its descendants as an indented outline.

    Each element produces one line: an optional ``"%6d:"`` line-number
    prefix, ``indent`` spaces, then the element's ``name_attr``.  Children
    are rendered two columns deeper.

    Args:
        tag: element to render (needs ``name``, ``name_attr``, ``line_num``
            and ``child_tags``).
        indent: number of leading spaces for this element.
        skip_cntags_childs: when true, children of elements listed in
            ``CN_TAGS`` are not rendered.
        with_line_num: when true, prefix every line with the line number.

    Returns:
        The outline as a single string, one line per rendered element.
    """
    prefix = "%6d:" % tag.line_num if with_line_num else ''
    pieces = [prefix, ' ' * indent, tag.name_attr, '\n']
    # Skip the children of elements that receive a comment: after translation
    # the order, or even the number, of their inner tags may differ, which
    # would make the two outlines impossible to compare.
    if not (skip_cntags_childs and tag.name in CN_TAGS):
        for sub in tag.child_tags:
            pieces.append(
                tag_to_str(
                    sub,
                    indent=indent + 2,
                    skip_cntags_childs=skip_cntags_childs,
                    with_line_num=with_line_num,
                )
            )
    return ''.join(pieces)
64+
65+
66+
class TagHandler():
    """Content handler that turns parser events into a tree of ``Tag`` objects.

    ``top_tags`` collects the elements that have no parent, and
    ``current_tag`` is the innermost element that is still open.
    """

    def __init__(self):
        self.top_tags = list()
        self.current_tag = None
        self.first_cn_tag = None

    def startElement(self, name, ctx):
        """Handle an element-start event: create a node and descend into it."""
        event = (ctx.filename, "startElement", name, ctx.line, ctx.col)
        logging.debug(str(event))
        node = Tag(name)
        node.start = ctx.tag_start
        # Text between '<' and the current parser position: name + attributes.
        node.name_attr = ctx.content[ctx.tag_start + 1:ctx.pos]
        node.line_num = ctx.line
        enclosing = self.current_tag
        if not enclosing:
            self.top_tags.append(node)
        else:
            node.parent = enclosing
            enclosing.child_tags.append(node)
        self.current_tag = node

    def endElement(self, name, ctx):
        """Handle an element-end event: close the current node, step back up."""
        event = (ctx.filename, "endElement", name, ctx.line, ctx.col)
        logging.debug(str(event))
        closing = self.current_tag
        closing.end = ctx.pos + 1
        self.current_tag = closing.parent

    def comment(self, content, ctx):
        """Handle a comment event (ignored)."""
        pass

    def characters(self, content, ctx):
        """Handle a character-data event (ignored)."""
        pass

    def tags(self):
        """Return a copy of the list of parsed top-level tags."""
        return list(self.top_tags)
103+
104+
class SGMLPaserContext():
    """Plain record of parser position state.

    Holds a tag stack, the content being parsed and the current line, column
    and absolute position, all starting out empty/zero.
    """

    def __init__(self):
        self.tagstack = tuple()
        self.content = str()
        self.line = self.col = self.pos = 0
111+
112+
class CommentTagContext():
    """State shared while annotated output is being written.

    Attributes:
        src_content: full text of the source SGML file.
        comment_content: full text of the SGML file the comments come from.
        fdest: writable file object for the output.
        pos: offset in ``src_content`` up to which text has been emitted.
    """

    def __init__(self, src_content, comment_content, fdest):
        self.pos = 0
        self.fdest = fdest
        self.comment_content = comment_content
        self.src_content = src_content
118+
119+
class SGMLPaser():
    """Event-driven scanner for SGML tags.

    The file content is walked one character at a time through a small state
    machine (``self.Status``) and the content handler is told what was found.
    Using ``<para>xxx</para>`` as an example:

    * initial state: ``none``
    * ``<``  -> ``tag_name``: the tag name starts right after it;
      ``characters()`` is called for the text collected before it
    * ``>``  -> ``none``: the tag is pushed on the stack and
      ``startElement()`` is called
    * ``</`` -> ``endtag``: ``characters()`` is called for the text collected
      before it
    * ``>``  -> ``none``: ``endElement()`` is called and the tag is popped

    Comments (``<!-- ... -->``) and marked sections (opened by ``<![CDATA[``,
    ``<![IGNORE[`` or ``<![%`` and closed by ``]]>``) are consumed as a whole
    and passed to the handler's ``comment()``.

    The parser passes itself as the ``ctx`` argument of every handler call,
    so handlers can read ``filename``, ``content``, ``line``, ``col``,
    ``pos`` and ``tag_start``.
    """

    # Elements that have no end tag.  They are closed immediately, but (as in
    # the original state machine) only when the tag name is followed by
    # whitespace or '/', i.e. when the start tag went through 'tag_name_end'.
    _EMPTY_TAGS = ('xref', 'footnoteref', 'colspec', 'spanspec', '!DOCTYPE', '!ENTITY', 'co')

    def __init__(self):
        self.Status = 'none'
        self.tagstack = list()
        self.filename = ''
        self.content = ''
        self.line = 0
        self.col = 0
        self.pos = 0
        self.tag_start = 0

    def setContentHandler(self, contentHandler):
        """Register the object that receives the parse events."""
        self.ctx_hander = contentHandler

    def parse(self, filename, sourceEncoding=r'UTF-8'):
        """Read ``filename`` and report its structure to the content handler.

        Args:
            filename: path of the SGML file to parse.
            sourceEncoding: text encoding used to read the file.

        Raises:
            Exception: if an end tag names a different element than the one
                currently open, or if an end tag appears while no element is
                open.
        """
        # Read everything up front and close the file straight away instead
        # of leaving the handle open for the garbage collector.
        with open(filename, mode='r', encoding=sourceEncoding) as fin:
            self.content = fin.read()
        content = self.content
        self.filename = filename
        # Start from a clean state so a parser instance can be reused even if
        # a previous file ended inside a comment or tag.
        self.Status = 'none'
        self.line = 1
        self.col = 0
        self.pos = pos = 0
        bufpos = 0          # start of the text not yet reported to the handler
        tagname_pos = 0     # offset of the first character of the tag name
        tagname = None
        endtagname = None
        tagstack = list()
        handler = self.ctx_hander
        for c in content:
            if c == '\n':
                self.line += 1
                self.col = 0
            if self.Status == 'none':
                if c == '<':
                    if content.startswith('<!--', pos):
                        self.Status = 'in_comment'
                    elif content.startswith(('<![CDATA[', '<![IGNORE[', '<![%'), pos):
                        self.Status = 'in_cdata'
                    elif content.startswith('</', pos):
                        match = closetag_pattern.match(content[pos:])
                        if match:
                            # None for the short form "</>".
                            endtagname = match.groups()[0]
                            self.Status = 'endtag'
                    elif opentag_pattern.match(content[pos:]):
                        self.Status = 'tag_name'
                        tagname_pos = pos + 1
                    if self.Status != 'none':
                        # Some markup starts here: flush the preceding text.
                        handler.characters(content[bufpos:pos], self)
                        bufpos = pos
                        self.tag_start = pos
            elif self.Status == 'in_comment':
                if c == ">" and content[pos - 2:pos + 1] == "-->":
                    self.Status = 'none'
                    handler.comment(content[bufpos:pos + 1], self)
                    bufpos = pos + 1
            elif self.Status == 'in_cdata':
                if c == ">" and content[pos - 2:pos + 1] == "]]>":
                    self.Status = 'none'
                    handler.comment(content[bufpos:pos + 1], self)
                    bufpos = pos + 1
            elif self.Status == 'tag_name':
                if c in ' \r\n\t/':
                    tagname = content[tagname_pos:pos]
                    self.Status = 'tag_name_end'
                elif c == ">":
                    tagname = content[tagname_pos:pos]
                    tagstack.append(tagname)
                    self.Status = 'none'
                    bufpos = pos + 1
                    handler.startElement(tagname, self)
            elif self.Status == 'tag_name_end':
                if c == ">":
                    tagstack.append(tagname)
                    self.Status = 'none'
                    bufpos = pos + 1
                    handler.startElement(tagname, self)
                    # Self-closing syntax or a known empty element: close it
                    # right away.
                    if content[pos - 1:pos] == '/' or tagname in self._EMPTY_TAGS:
                        handler.endElement(tagstack.pop(), self)
            elif self.Status == 'endtag':
                if c == ">":
                    self.Status = 'none'
                    bufpos = pos + 1
                    if not tagstack:
                        # Give a located message instead of the bare
                        # IndexError that pop() would raise.
                        raise Exception('unexpected end tag "%s" in %s:%d, no element is open' % (endtagname, filename, self.line))
                    tagname2 = tagstack.pop()
                    if endtagname is not None and endtagname != tagname2:
                        raise Exception('unmatched end tag "%s" in %s:%d, while %s is expect' % (endtagname,filename,self.line,tagname2), tagstack)
                    handler.endElement(tagname2, self)
            pos += 1
            self.pos = pos
            self.col += 1
        # Report whatever text follows the last piece of markup.
        handler.characters(content[bufpos:], self)
224+
225+
def process_tag(tag,commenttag,ctx):
    """Emit the output for ``tag``, inserting the matching original text.

    For an element listed in ``CN_TAGS`` the pending source text is written
    up to the start of the element's line, followed by a comment block that
    holds the corresponding text from the comment file.  Any other element is
    only walked: its children are paired with the children of ``commenttag``
    and processed recursively.

    Args:
        tag: element from the source file.
        commenttag: element at the same position in the comment file.
        ctx: ``CommentTagContext`` with both file contents, the output file
            and the offset (``ctx.pos``) up to which source text was written.

    Raises:
        Exception: if the two elements have different names or a different
            number of children.
    """
    if tag.name != commenttag.name:
        raise Exception('tag "%s" in source sgml and tag "%s" in comment sgml does not match' % (tag.name,commenttag.name))
    out = ctx.fdest

    if tag.name not in CN_TAGS:
        src_children = tag.childs()
        ref_children = commenttag.childs()
        if len(src_children) != len(ref_children):
            raise Exception('child tag number of tag "%s" in source sgml (%d) and comment sgml (%d) does not match' % (tag.name,len(src_children),len(ref_children)))
        for src_child, ref_child in zip(src_children, ref_children):
            process_tag(src_child, ref_child, ctx)
        return

    # Search backwards from the start of the tag for the preceding newline.
    # If only blanks/tabs separate the tag from the line start, the comment
    # goes in front of that indentation; otherwise directly before the tag.
    source = ctx.src_content
    scan = tag.start
    while scan > 0 and source[scan - 1] in ' \t':
        scan -= 1
    if scan == 0 or source[scan - 1] == '\n':
        insert_pos = scan
    else:
        insert_pos = tag.start

    out.write(source[ctx.pos:insert_pos])
    out.write('%s\n' % (COMMENT_NOTE_START,))
    out.write(source[insert_pos:tag.start])
    # "--" may not appear inside an SGML comment, so it is rewritten.
    original_text = ctx.comment_content[commenttag.start:commenttag.end]
    out.write(original_text.replace(r'--',r'-&minus;'))
    out.write('\n%s\n' % (COMMENT_NOTE_END,))
    # The element itself (with its indentation) is still to be written.
    ctx.pos = insert_pos
259+
260+
def _parse_top_tags(path):
    """Parse the SGML file at ``path`` and return its top-level tags."""
    parser = SGMLPaser()
    handler = TagHandler()
    parser.setContentHandler(handler)
    parser.parse(path)
    return handler.tags()


def _tags_outline(tags, with_line_num=False):
    """Render ``tags`` as one outline string, skipping children of CN_TAGS."""
    return ''.join(
        t.to_str(skip_cntags_childs=True, with_line_num=with_line_num)
        for t in tags
    )


def _write_text(path, text):
    """Write ``text`` to ``path`` using the tool's file encoding."""
    with open(path, mode='w', encoding=file_encoding) as fout:
        fout.write(text)


def process_file(sourceFile,commentFile,destFile):
    """Annotate one SGML file with the original text from ``commentFile``.

    Both files are parsed and their tag outlines compared.  If the outlines
    differ, four diagnostic files named after the source file's base name
    (``.srctags``, ``.commenttags``, ``.srctags_detail``,
    ``.commenttags_detail``) are written using relative paths and the file is
    skipped.  Otherwise ``destFile`` is written with a comment block inserted
    in front of every element listed in ``CN_TAGS``.

    Progress ("skip", "skip for error:...", "ok") is printed to stdout.

    Args:
        sourceFile: SGML file that receives the comments.
        commentFile: SGML file the comment text is taken from.
        destFile: path of the annotated output; missing parent directories
            are created.

    Raises:
        Exception: if the parsed structures turn out not to match while the
            output is being produced.
    """
    print(sourceFile + ': ', end='')
    filename = os.path.basename(sourceFile)
    if filename in skip_sgmls:
        print("skip")
        return

    # Parse sourceFile, then commentFile.
    srctags = _parse_top_tags(sourceFile)
    commenttags = _parse_top_tags(commentFile)

    # Check that the source tags and the comment tags have the same structure.
    if _tags_outline(srctags) != _tags_outline(commenttags):
        # Write the plain tag outlines ...
        _write_text(filename + '.srctags', _tags_outline(srctags))
        _write_text(filename + '.commenttags', _tags_outline(commenttags))
        # ... and the detailed outlines that carry line numbers.
        _write_text(filename + '.srctags_detail', _tags_outline(srctags, with_line_num=True))
        _write_text(filename + '.commenttags_detail', _tags_outline(commenttags, with_line_num=True))
        # Report the problem and give up on this file.
        print("skip for error:",end='')
        print ('source sgml "%s" and comment sgml "%s" does not match' %
               (sourceFile,commentFile))
        return

    # The original message here referenced an undefined name ``tag`` and so
    # raised NameError; report the files instead.  Checked before the output
    # file is created so that no empty file is left behind.
    if len(srctags) != len(commenttags):
        raise Exception('top-level tag number in source sgml "%s" (%d) and comment sgml "%s" (%d) does not match' % (sourceFile,len(srctags),commentFile,len(commenttags)))

    # Load both files; each handle is closed as soon as it has been read.
    with open(sourceFile, mode='r', encoding=file_encoding) as fsrc:
        src_content = fsrc.read()
    with open(commentFile, mode='r', encoding=file_encoding) as fcomment:
        comment_content = fcomment.read()

    destDir = os.path.dirname(destFile)
    if destDir:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(destDir, exist_ok=True)

    # Walk the tags and write the annotated output; the context manager
    # guarantees the output is flushed and closed even if a tag fails.
    with open(destFile, mode='w', encoding=file_encoding) as fdest:
        commentTagContext = CommentTagContext(src_content,comment_content,fdest)
        for srctag, commenttag in zip(srctags, commenttags):
            process_tag(srctag,commenttag,commentTagContext)
        # Everything after the last annotated element is copied unchanged.
        fdest.write(src_content[commentTagContext.pos:])
    print("ok")
322+
323+
def process(source,comment,dest):
    """Annotate ``source``, recursing into it when it is a directory.

    For a directory every entry is processed with the same entry name joined
    onto ``comment`` and ``dest``.  A non-directory is handed to
    ``process_file`` only when its path ends in ".sgml"; anything else is
    ignored.
    """
    if not os.path.isdir(source):
        if source.endswith(".sgml"):
            process_file(source,comment,dest)
        return

    # Process every entry of this directory.
    for entry in os.listdir(source):
        process(
            os.path.join(source, entry),
            os.path.join(comment, entry),
            os.path.join(dest, entry),
        )
334+
335+
336+
337+
338+
339+
if __name__ == "__main__":
    # Parse the command-line arguments: exactly three paths are required.

    if len(sys.argv) != 4:
        print ('使用方法: %s <输入sgml> <注释源sgml> <输出sgml>' % os.path.basename(sys.argv[0]))
        print ('参数:')
        print (' 输入sgml:需要添加注释的原始sgml文件,批量处理时为sgml文件所在目录')
        print (' 注释源sgml:注释来源的sgml,可和输入sgml相同,批量处理时为sgml文件所在目录')
        print (' 输出sgml:添加注释后的sgml,批量处理时为sgml文件所在目录')
        sys.exit(1)

    sourceFile = sys.argv[1]
    commentFile = sys.argv[2]
    destFile = sys.argv[3]

    # Both inputs must exist.
    if not os.path.exists(sourceFile) or not os.path.exists(commentFile):
        print ('"%s" 或 "%s" 不存在' % (sourceFile,commentFile))
        sys.exit(1)

    # The output must not overwrite the input.  Resolved paths are compared
    # so that different spellings of one location ("a.sgml" vs "./a.sgml",
    # or a symlink) are rejected too, not only identical strings.
    if os.path.realpath(sourceFile) == os.path.realpath(destFile):
        print ('输出sgml和输入sgml不允许相同')
        sys.exit(1)

    # Source and comment source must be of the same kind: both files
    # (single-file mode) or both directories (batch mode).
    if os.path.isfile(sourceFile):
        if not os.path.isfile(commentFile):
            print ('%s 必须是已存在的文件' % commentFile)
            sys.exit(1)
    else:
        if not os.path.isdir(commentFile):
            print ('%s 必须是已存在的目录' % commentFile)
            sys.exit(1)

    process(sourceFile,commentFile,destFile)
372+
373+
374+

0 commit comments

Comments
 (0)