Well, your code misses some modules, as I pointed out earlier, while the proposed code misses fewer. It is also much smaller, and therefore less error-prone.
Below is a slightly updated version that also handles the old-school {{ModuleFilesTable}}
template, etc.
#!/usr/bin/env python
from mwparserfromhell import parse
from json import dumps
def do_gameinfo(code):
    """Extract the ``GameInfo`` template from ``code`` and remove it.

    Only the first ``GameInfo`` template found is used.  The template name
    is compared after stripping surrounding whitespace, so a template
    written as ``{{GameInfo`` followed by a line break is still found.

    Args:
        code: Parsed wiki text providing ``filter_templates`` and
            ``replace`` (a mwparserfromhell ``Wikicode``).  It is modified
            in place: the template that was read is replaced by ``''``.

    Returns:
        A dict with one entry per known field that is present in the
        template.  Each value is the parameter value converted to ``str``;
        surrounding whitespace is kept as written in the wiki text.

    Raises:
        RuntimeError: If ``code`` contains no ``GameInfo`` template.
    """
    fields = ('image',
              'publisher',
              'year',
              'era',
              'topic',
              'series',
              'scale',
              'players',
              'length')
    found = code.filter_templates(
        matches=lambda n: str(n.name).strip() == 'GameInfo')
    if not found:
        raise RuntimeError('No GameInfo')
    info = found[0]
    # Read the parameter's value directly instead of removing 'key=' from
    # its text: that keeps working with spaces around '=' and leaves a
    # literal 'key=' inside the value untouched.
    ret = {k: str(info.get(k).value) for k in fields if info.has(k)}
    code.replace(info, '')
    return ret
def do_emails(text):
    """Parse ``text`` as wiki markup and list the contacts found in it.

    Every template in ``text`` is treated as one contact: its first
    parameter is the address and its second parameter, when present, is
    the name.  Templates without any parameter are skipped (they used to
    raise ``IndexError``), and so are entries whose address is the
    placeholder ``someguy@example.com``.

    Args:
        text: Wiki markup (anything ``parse`` accepts).

    Returns:
        A list of ``{'name': ..., 'address': ...}`` dicts, in document
        order.  ``name`` is ``''`` when the template has one parameter only.
    """
    placeholder = 'someguy@example.com'
    contacts = []
    for tmpl in parse(text).filter_templates():
        if not tmpl.params:
            # No first parameter means there is no address to report.
            continue
        address = str(tmpl.params[0])
        if address == placeholder:
            continue
        name = str(tmpl.params[1]) if len(tmpl.params) > 1 else ''
        contacts.append({'name': name, 'address': address})
    return contacts
def do_modules(code):
    """Collect the module file tables from ``code`` and strip them from it.

    The top-level templates are read in document order: a
    ``ModuleFilesTable``/``ModuleFilesTable2`` template starts a table, a
    ``ModuleVersion``/``ModuleVersion2`` template starts the file list of
    one version, and every ``ModuleFile``/``ModuleFile2`` template adds one
    file to the current version.  Afterwards the contacts of every
    top-level ``ModuleContactInfo`` template are appended to all collected
    files.  Template names are compared after stripping surrounding
    whitespace.

    Args:
        code: Parsed wiki text (a mwparserfromhell ``Wikicode``).  It is
            modified in place: every template read here is replaced by
            ``''``.

    Returns:
        A dict mapping each version string to a list of file dicts, or
        ``None`` when no table template was found.  A file dict holds the
        fields present in the template (``filename``, ``description``,
        ``date``, ``size``, ``compatibility``) plus ``maintainers`` and
        ``contributors`` lists as produced by ``do_emails``.

    Raises:
        RuntimeError: If a version or file template appears before any
            table template, or a file template appears before any version
            template.
    """
    table_names = ('ModuleFilesTable2', 'ModuleFilesTable')
    version_names = ('ModuleVersion2', 'ModuleVersion')
    file_names = ('ModuleFile2', 'ModuleFile')
    known_names = table_names + version_names + file_names
    # 'description' was misspelled 'decription' before, so it never matched.
    file_fields = ('filename',
                   'description',
                   'date',
                   'size',
                   'compatibility')

    def name_of(tm):
        # The parsed name keeps any whitespace that follows it in the markup.
        return str(tm.name).strip()

    def contacts_of(tm, key):
        return do_emails(str(tm.get(key)) if tm.has(key) else '')

    tmpl = code.filter_templates(matches=lambda n: name_of(n) in known_names,
                                 recursive=False)
    tab = None
    cur = None
    for tm in tmpl:
        name = name_of(tm)
        if name in table_names:
            # NOTE(review): a second table template starts from an empty
            # dict again, dropping versions collected so far - confirm
            # that pages never carry more than one table.
            tab = {}
            continue
        if tab is None:
            raise RuntimeError(f'{name} seen before {",".join(table_names)}')
        if name in version_names:
            cur = []
            tab[str(tm.get('version').value)] = cur
            continue
        if cur is None:
            raise RuntimeError('No current version')
        # U+200E (left-to-right mark) is dropped from every field value.
        db = {k: str(tm.get(k).value).replace('\u200e', '')
              for k in file_fields
              if tm.has(k)}
        db['maintainers'] = contacts_of(tm, 'maintainer')
        db['contributors'] = contacts_of(tm, 'contributors')
        cur.append(db)
    for tm in tmpl:
        code.replace(tm, '')

    contact_tmpl = code.filter_templates(
        matches=lambda n: name_of(n) == 'ModuleContactInfo',
        recursive=False)
    for tm in contact_tmpl:
        main = contacts_of(tm, 'maintainer')
        cont = contacts_of(tm, 'contributors')
        # ``tab`` stays None when the page has no table; nothing to extend.
        for files in (tab or {}).values():
            for db in files:
                if main:
                    db.setdefault('maintainers', []).extend(main)
                if cont:
                    db.setdefault('contributors', []).extend(cont)
    for tm in contact_tmpl:
        code.replace(tm, '')
    return tab
def do_gallery(code):
    """Read all ``<gallery>`` tags of ``code`` and remove them.

    Each non-empty line inside a gallery tag becomes one entry.  The line
    is split on ``|``: the first field, with every ``Image:`` removed, is
    the image and the second field, if any, is the alternative text.

    Args:
        code: Parsed wiki text providing ``filter_tags`` and ``replace``.
            It is modified in place: every gallery tag is replaced by
            ``''``.

    Returns:
        A list of ``{'img': ..., 'alt': ...}`` dicts in document order, or
        ``[]`` when there is no gallery tag.
    """
    galleries = code.filter_tags(matches=lambda node: node.tag == 'gallery')
    if not galleries:
        return []

    images = []
    for gallery in galleries:
        for line in gallery.contents.split('\n'):
            if line == '':
                continue
            parts = line.split('|')
            images.append({
                'img': parts[0].replace('Image:', ''),
                'alt': parts[1] if len(parts) >= 2 else '',
            })

    # Strip the tags only after all of them have been read.
    for gallery in galleries:
        code.replace(gallery, '')
    return images
def do_players(code):
    """Read the contacts inside every ``<div>`` tag of ``code``.

    The contents of each non-empty ``div`` tag are handed to ``do_emails``;
    afterwards all ``div`` tags, empty or not, are replaced by ``''`` in
    ``code``.

    Args:
        code: Parsed wiki text providing ``filter_tags`` and ``replace``;
            modified in place.

    Returns:
        A list with one ``do_emails`` result (itself a list) per non-empty
        ``div`` tag, or ``[]`` when there is no ``div`` tag.
    """
    divs = code.filter_tags(matches=lambda node: node.tag == 'div')
    if not divs:
        return []

    players = []
    for div in divs:
        if div.contents != '':
            players.append(do_emails(div.contents))

    for div in divs:
        code.replace(div, '')
    return players
def do_readme(code):
    """Convert the wiki text in ``code`` to Markdown by running ``pandoc``.

    The text is written to a temporary file, which ``pandoc`` converts from
    ``mediawiki`` to ``markdown-simple_tables``.  The temporary file is
    removed even when writing it or running ``pandoc`` fails.

    Args:
        code: The wiki text; ``str(code)`` is what gets converted.

    Returns:
        The Markdown produced by ``pandoc`` with every escaped ``|}`` and
        escaped ``__NOTOC__`` removed.

    Raises:
        RuntimeError: If ``pandoc`` exits with a non-zero status; the
            message carries what it wrote to standard error.
        OSError: If ``pandoc`` cannot be started or the temporary file
            cannot be written.
    """
    from os import unlink
    from subprocess import PIPE, Popen
    from tempfile import mkstemp

    tmp, tmpnam = mkstemp(text=True)
    try:
        # pandoc reads and writes UTF-8 whatever the locale says, so do
        # not leave the encoding to the platform default.
        with open(tmp, 'w', encoding='utf-8') as tmpfile:
            tmpfile.write(str(code))
        cmd = ['pandoc',
               '--from', 'mediawiki',
               '--to', 'markdown-simple_tables',
               tmpnam]
        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
        out, err = proc.communicate()
    finally:
        unlink(tmpnam)

    if proc.returncode != 0:
        # An ignored failure would silently yield an empty README.
        raise RuntimeError(
            f'pandoc failed with status {proc.returncode}: '
            f'{err.decode("utf-8", errors="replace").strip()}')
    return (out.decode('utf-8')
            .replace(r'\|}', '')
            .replace(r'\_\_NOTOC\_\_', ''))
def convert(inp, md, js):
    """Split one wiki page into a JSON description and a Markdown README.

    Args:
        inp: Readable text stream holding the wiki page.
        md: Writable text stream receiving the Markdown README.
        js: Writable text stream receiving the JSON document, which has
            the keys ``info``, ``modules``, ``gallery`` and ``players``.
    """
    wiki = parse(inp.read())

    # Every extractor removes what it read from ``wiki``, so the README is
    # generated last, from whatever is left over.
    game = {}
    game['info'] = do_gameinfo(wiki)
    game['modules'] = do_modules(wiki)
    game['gallery'] = do_gallery(wiki)
    game['players'] = do_players(wiki)
    markdown = do_readme(wiki)

    js.write(dumps(game, indent=2))
    md.write(markdown)
if __name__ == '__main__':
    # Command line entry point: INPUT README JSON.
    from argparse import ArgumentParser, FileType

    ap = ArgumentParser(description='Convert')
    # All three files are handled as UTF-8 rather than in the platform's
    # default encoding; the Markdown is decoded from pandoc's UTF-8 output.
    ap.add_argument('input', type=FileType('r', encoding='utf-8'),
                    help='Input media wiki')
    ap.add_argument('readme', type=FileType('w', encoding='utf-8'),
                    help='Output markdown')
    ap.add_argument('json', type=FileType('w', encoding='utf-8'),
                    help='Output JSON')
    args = ap.parse_args()

    # Close (and thereby flush) the files as soon as the conversion ends.
    with args.input, args.readme, args.json:
        convert(args.input, args.readme, args.json)
Use it any way you like; I just think you could get the job done sooner if you used some of this.
Yours,
Christian