Hi again,
Again looking at Napoleon at Waterloo I see you fixed up the header thing. However, the tables are formatted as HTML rather than Markdown.
Are you “rolling your own” parsing? If so, why? It seems much more reasonable to me to use existing tools such as pandoc, possibly with some pre-parsing in Python (as in my previous message) or the like. I guess you are splitting the Wiki pages into several database tables - one for “packages” (shouldn’t it be “modules”?), one for README.md, and so on.
Perhaps something like the below would do most of what you need:
#!/usr/bin/env python
from mwparserfromhell import parse
from json import dumps
def do_gameinfo(code):
    """Extract the ``GameInfo`` template from *code* and remove it.

    Args:
        code: Parsed wiki text (the object returned by
            ``mwparserfromhell.parse``).  It is modified in place: the
            ``GameInfo`` template is replaced by an empty string.

    Returns:
        A dict mapping each recognised field that the template provides
        to its value as a whitespace-stripped string.  Fields the
        template does not have are left out.

    Raises:
        RuntimeError: If *code* contains no ``GameInfo`` template.
    """
    fields = ['image',
              'publisher',
              'year',
              'era',
              'topic',
              'series',
              'scale',
              'players',
              'length']
    # Compare the stripped name so that a template spread over several
    # lines ("{{GameInfo" followed by a newline) is still recognised.
    found = code.filter_templates(
        matches=lambda n: str(n.name).strip() == 'GameInfo')
    if not found:
        raise RuntimeError('No GameInfo')
    gi = found[0]
    # Template.get() returns a Parameter whose str() is "name=value";
    # use .value so only the value itself ends up in the result.
    ret = {k: str(gi.get(k).value).strip() for k in fields if gi.has(k)}
    code.replace(gi, '')
    return ret
def do_emails(text):
    """Parse *text* and turn every template in it into a contact entry.

    The first parameter of each template is taken as the address and the
    second as the name.

    Args:
        text: Wiki text (anything ``mwparserfromhell.parse`` accepts).

    Returns:
        A list of ``{'name': ..., 'address': ...}`` dicts, one per
        template that has at least one parameter.  A template with only
        one parameter gets an empty name; a template without parameters
        is skipped instead of raising ``IndexError``.
    """
    entries = []
    for tmpl in parse(text).filter_templates():
        params = tmpl.params
        if not params:
            # Nothing that could serve as an address.
            continue
        name = str(params[1]) if len(params) > 1 else ''
        entries.append({'name': name,
                        'address': str(params[0])})
    return entries
def do_modules(code):
    """Collect the module file table from *code* and remove its templates.

    Only top-level templates named ``ModuleFilesTable2``,
    ``ModuleVersion2`` and ``ModuleFile2`` are considered, in document
    order.  A ``ModuleFilesTable2`` starts a fresh table, a
    ``ModuleVersion2`` starts a new version entry keyed by its
    ``version`` parameter, and a ``ModuleFile2`` fills in the current
    version entry.

    Args:
        code: Parsed wiki text (the object returned by
            ``mwparserfromhell.parse``).  It is modified in place: all
            matched templates are replaced by an empty string.

    Returns:
        A dict mapping version string to a dict of file fields plus
        ``'maintainers'`` and ``'contributors'`` lists, or ``None`` when
        no ``ModuleFilesTable2`` template was seen.

    Raises:
        RuntimeError: If a version or file template appears before the
            table template, or a file template before any version.
    """
    names = ['ModuleFilesTable2',
             'ModuleVersion2',
             'ModuleFile2']
    # NOTE(review): 'decription' is kept exactly as written; presumably
    # it mirrors the parameter spelling used on the wiki - confirm
    # before "correcting" it.
    file_keys = ['filename',
                 'decription',
                 'date',
                 'size',
                 'compatibility']

    def name_of(tm):
        # Stripped so that names followed by a newline still match.
        return str(tm.name).strip()

    def value_of(tm, key):
        # str() of a Parameter is "name=value"; only the value is wanted.
        return str(tm.get(key).value).strip()

    def people(tm, key):
        return do_emails(value_of(tm, key) if tm.has(key) else '')

    tmpl = code.filter_templates(matches=lambda n: name_of(n) in names,
                                 recursive=False)
    tab = None
    cur = None
    for tm in tmpl:
        name = name_of(tm)
        if name == names[0]:
            tab = {}
            continue
        if tab is None:
            raise RuntimeError(f'{name} seen before {names[0]}')
        if name == names[1]:
            cur = {}
            tab[value_of(tm, 'version')] = cur
            continue
        if cur is None:
            raise RuntimeError('No current version')
        # NOTE(review): several ModuleFile2 templates under one version
        # all update the same dict, so later files overwrite earlier
        # ones - confirm whether that is intended.
        cur.update({k: value_of(tm, k) for k in file_keys if tm.has(k)})
        cur['maintainers'] = people(tm, 'maintainer')
        cur['contributors'] = people(tm, 'contributors')
    for tm in tmpl:
        code.replace(tm, '')
    return tab
def do_gallery(code):
    """Extract the image entries of every ``gallery`` tag and remove the tags.

    Each non-empty line of a gallery is split on ``'|'``: the first
    field, with any ``'Image:'`` text removed, is the image and the
    second field, when present, is the alternative text.

    Args:
        code: Parsed wiki text; the gallery tags are replaced by an empty
            string in place.

    Returns:
        A list of ``{'img': ..., 'alt': ...}`` dicts in document order,
        or an empty list when there is no gallery tag.
    """
    galleries = code.filter_tags(matches=lambda node: node.tag == 'gallery')
    if not galleries:
        return []

    images = []
    for gallery in galleries:
        for line in gallery.contents.split('\n'):
            if line == '':
                continue
            parts = line.split('|')
            images.append({
                'img': parts[0].replace('Image:', ''),
                'alt': parts[1] if len(parts) >= 2 else '',
            })

    # Strip the galleries only after everything has been read from them.
    for gallery in galleries:
        code.replace(gallery, '')
    return images
def do_players(code):
    """Extract contact entries from every ``div`` tag and remove the tags.

    Args:
        code: Parsed wiki text; the div tags are replaced by an empty
            string in place.

    Returns:
        A list with one item per div whose contents are not empty; each
        item is the list ``do_emails`` returns for that div.  An empty
        list is returned when there is no div tag.
    """
    divs = code.filter_tags(matches=lambda node: node.tag == 'div')
    if not divs:
        return []

    groups = []
    for div in divs:
        if div.contents != '':
            groups.append(do_emails(div.contents))

    for div in divs:
        code.replace(div, '')
    return groups
def do_readme(code):
    """Convert the remaining wiki text to Markdown with ``pandoc``.

    The text is piped to pandoc on standard input as UTF-8, so no
    temporary file is needed and nothing is left behind when pandoc
    cannot be started or fails.

    Args:
        code: Parsed wiki text (or anything whose ``str()`` is MediaWiki
            markup).

    Returns:
        The Markdown produced by pandoc, with every literal ``\\|}``
        sequence removed.

    Raises:
        RuntimeError: If pandoc exits with a non-zero status; the message
            carries pandoc's error output.
        OSError: If the ``pandoc`` executable cannot be started.
    """
    from subprocess import run, PIPE

    # "markdown-simple_tables" is pandoc's Markdown with the
    # simple_tables extension switched off.  With no input file named,
    # pandoc reads the document from standard input.
    cmd = ['pandoc',
           '--from', 'mediawiki',
           '--to', 'markdown-simple_tables']
    proc = run(cmd,
               input=str(code).encode('utf-8'),
               stdout=PIPE,
               stderr=PIPE)
    if proc.returncode != 0:
        # Do not silently hand back an empty or partial README.
        raise RuntimeError(
            'pandoc failed: '
            + proc.stderr.decode('utf-8', errors='replace'))
    # Presumably these are escaped table-closing markers pandoc leaves
    # in the output - confirm against real pages.
    return proc.stdout.decode('utf-8').replace(r'\|}', '')
def convert(inp, md, js):
    """Convert one wiki page into a Markdown README and a JSON data file.

    Args:
        inp: Readable text file object holding the MediaWiki source.
        md: Writable text file object that receives the Markdown README.
        js: Writable text file object that receives the game data as
            JSON, indented by two spaces.
    """
    wiki = parse(inp.read())

    # Order matters: each extractor removes what it consumed from
    # ``wiki``, and the README is rendered from whatever is left.
    game = {
        'info': do_gameinfo(wiki),
        'modules': do_modules(wiki),
        'gallery': do_gallery(wiki),
        'players': do_players(wiki),
    }
    markdown = do_readme(wiki)

    js.write(dumps(game, indent=2))
    md.write(markdown)
if __name__ == '__main__':
    # Command-line entry point: INPUT README JSON.
    from argparse import ArgumentParser, FileType

    ap = ArgumentParser(description='Convert')
    # Open all files as UTF-8 rather than in the locale's encoding.
    # NOTE(review): assumes the wiki export is UTF-8 - confirm.
    ap.add_argument('input', type=FileType('r', encoding='utf-8'),
                    help='Input media wiki')
    ap.add_argument('readme', type=FileType('w', encoding='utf-8'),
                    help='Output markdown')
    ap.add_argument('json', type=FileType('w', encoding='utf-8'),
                    help='Output JSON')
    args = ap.parse_args()
    # FileType opens the files while parsing; close (and flush) them
    # once the conversion is done, even when it raises.
    with args.input, args.readme, args.json:
        convert(args.input, args.readme, args.json)
Give it an input MediaWiki file and two output files - the Markdown README and a JSON file for the game data.
Yours,
Christian