Benutzer:Hub/antragsfabrik-openslides-export.py
< Benutzer:Hub
Version vom 15. Juni 2012, 15:15 Uhr von Hub (Diskussion | Beiträge) (Script zum Scrapen von Anträgen aus dem BB-Listing in das OpenSlides-CSV-Format.)
#!/usr/bin/env python # -*- coding: utf-8 -*- import urllib2 import json import os import csv
# Base URL of the MediaWiki API of the Brandenburg Pirate Party wiki.
API_URL = "http://wiki.piratenbrandenburg.de/api.php"
# Wiki categories whose member pages (motions for the 2012.1 party congress)
# are exported.
CATEGORIES = ["Kategorie:Sonstiger_Antrag_AF_LPT_2012.1", "Kategorie:Satzungsänderungsantrag_AF_LPT_2012.1", "Kategorie:Programmantrag_AF_LPT_2012.1"]
# Maximum number of page ids sent in a single API request
# (batch size for the pageids= parameter).
MAX_PAGEIDS = 50
def get_json(endpoint):
    """Fetch *endpoint* from the wiki API and return the raw JSON response.

    endpoint -- query-string fragment such as "action=query&...".
    Returns the undecoded response body as returned by the server.
    """
    # Bug fix: the original read `url = .join([...])`, which is a
    # SyntaxError -- str.join must be called on the separator string.
    url = "".join([API_URL, '?', endpoint, '&format=json'])
    return urllib2.urlopen(url).read()
def get_category(category, query_continue=""):
    """Return all member pages of the wiki category *category*.

    Follows MediaWiki "query-continue" pagination recursively via the
    cmcontinue token.  Returns the concatenated "categorymembers" lists.
    """
    # Bug fix: quote the parameters -- the category names contain
    # non-ASCII umlauts (e.g. "Satzungsänderungsantrag...") which are
    # not legal in a raw URL and would break urlopen.
    data = get_json("action=query&list=categorymembers&cmtitle=%s&cmcontinue=%s"
                    % (urllib2.quote(category), urllib2.quote(query_continue)))
    json_data = json.loads(data)
    pages = json_data["query"]["categorymembers"]
    # More members available: fetch the next page and append it.
    if "query-continue" in json_data:
        pages += get_category(
            category,
            json_data["query-continue"]["categorymembers"]["cmcontinue"])
    return pages
def list_applications(categories):
    """Return the application listing, preferring the local cache.

    Reads the "application_list" cache file if it exists; otherwise
    downloads the listing for *categories* from the wiki.
    """
    if os.path.isfile("application_list"):
        # Bug fix: close the cache file deterministically instead of
        # leaking the handle (the original never closed it).
        with open('application_list', 'r') as f:
            return json.load(f)
    return download_applications(categories)
def download_applications(categories):
    """Download the listing for *categories* and cache it in "application_list"."""
    applications = _list_applications(categories)
    # Bug fix: the original flushed but never closed the cache file;
    # "with" guarantees flush and close even on errors.
    with open('application_list', 'w+') as f:
        json.dump(applications, f)
    return applications
def _list_applications(categories):
    """Fetch the category-member listing for every category in *categories*.

    Returns a dict mapping each category name to its list of member pages.
    """
    # One API listing per category, keyed by the category name.
    return dict((name, get_category(name)) for name in categories)
def get_raw_pageid(pageid):
    """Fetch the latest revision content for the given page id(s).

    pageid -- a single page id or several ids joined with "|".
    Returns the concatenated "revisions" lists of all returned pages.
    """
    raw = get_json("action=query&prop=revisions&rvprop=content&pageids=%s" % pageid)
    pages_by_id = json.loads(raw)["query"]["pages"]
    revisions = []
    for key in pages_by_id:
        revisions.extend(pages_by_id[key]["revisions"])
    return revisions
def chunks(l, n):
    """Yield successive slices of *l* that are at most *n* items long."""
    start = 0
    while start < len(l):
        yield l[start:start + n]
        start += n
def get_pageid(pageids):
    """Resolve *pageids* in batches of MAX_PAGEIDS and return all revisions."""
    pages = []
    for batch in chunks(pageids, MAX_PAGEIDS):
        # The API accepts multiple ids joined with "|" in one request.
        joined = "|".join(str(pid) for pid in batch)
        pages.extend(get_raw_pageid(joined))
    return pages
def _list_content(applications):
    """Download revision content for every application page, per category.

    applications -- dict mapping category name to a list of member-page
    dicts (each with a "pageid" key), as produced by _list_applications().
    Returns a dict mapping category name to the list of revisions.
    """
    content = {}
    for category in applications:
        # Bug fix: the original built the id list in a side dict that was
        # never created for an empty category, so reading it back raised
        # KeyError.  A comprehension handles the empty case naturally.
        ids = [member["pageid"] for member in applications[category]]
        content[category] = get_pageid(ids)
    return content
def download_content(applications):
    """Download and cache the page content for all *applications*."""
    content = _list_content(applications)
    # Bug fix: the original flushed but never closed the cache file;
    # "with" guarantees flush and close even on errors.
    with open('content', 'w+') as f:
        json.dump(content, f)
    return content
def list_content(applications):
    """Return the cached page content, downloading it if no cache exists."""
    if os.path.isfile("content"):
        # Bug fix: close the cache file deterministically instead of
        # leaking the handle (the original never closed it).
        with open('content', 'r') as f:
            return json.load(f)
    return download_content(applications)
def parse_content(content):
    """Parse raw wikitext revisions into application dicts per category.

    Drops applications whose "Eingereicht" (submitted) field is empty and
    sorts the remainder alphabetically by their "Titel" field.
    """
    parsed = {}
    for category in content:
        entries = [mediawiki_template(revision["*"])
                   for revision in content[category]]
        submitted = [entry for entry in entries if entry["Eingereicht"] != ""]
        submitted.sort(key=lambda entry: entry["Titel"])
        parsed[category] = submitted
    return parsed
def mediawiki_template(mw_string):
    """Parse the first MediaWiki template in *mw_string* into a dict.

    The template body (between "{{" and "}}") is split on "\\n|" into
    "key=value" fields; the leading template name ("Antragsfabrikat")
    is discarded.  Raises SyntaxError on a field that lacks "=".
    """
    body = mw_string.split("{{")[1].split("}}")[0]
    # The first element is the template name, not a key/value field.
    fields = body.split("\n|")[1:]
    result = {}
    for field in fields:
        keyval = field.split("=", 1)
        if 2 != len(keyval):
            raise SyntaxError("Mediawiki parsing Error %s" % keyval)
        key, val = [part.strip() for part in keyval]
        result[key] = val
    return result
def filter_content(content):
    """Convert a few HTML constructs in wikitext to plain text.

    NOTE(review): this block was corrupted by the wiki export -- several
    replace() source arguments were rendered away (the dump shows e.g.
    replace("1","¹"), which would mangle every digit, and replace calls
    whose source string became a literal line break).  The surviving
    variants ("<br\\>", "<br\\n>") suggest the lost ones were the usual
    <br> spellings and the &sup1;/&sup2;/&sup3; entities; reconstructed
    accordingly -- confirm against an original copy of the script.
    """
    # Superscript HTML entities -> unicode superscript characters.
    content = content.replace("&sup1;", "¹")
    content = content.replace("&sup2;", "²")
    content = content.replace("&sup3;", "³")
    # Line-break tags in their various spellings -> newline.
    content = content.replace("<br>", "\n")
    content = content.replace("<br\>", "\n")
    content = content.replace("<br\\n>", "\n")
    content = content.replace("<br/>", "\n")
    content = content.replace("<br />", "\n")
    return content
def write_content(applications):
    """Write one OpenSlides import CSV per category.

    The output file is named after the category.  The submitter goes
    into the first-name column; number and last name are left blank.
    """
    header = ("Number", "Title", "Text", "Reason",
              "Submitter (First Name)", "Submitter (Last Name)")
    for category in applications:
        with open(category, 'w+') as f:
            writer = csv.writer(f, delimiter=',', quotechar='"',
                                quoting=csv.QUOTE_ALL)
            writer.writerow(header)
            for entry in applications[category]:
                writer.writerow((
                    "",                                                  # Number
                    entry["Titel"].encode('utf8'),                       # Title
                    filter_content(entry["Antragstext"].encode('utf8')), # Text
                    filter_content(entry[u'Begr\xfcndung'].encode('utf8')),
                    entry["Antragsteller"].encode('utf8'),               # First name
                    ""))                                                 # Last name
if __name__ == '__main__':
    # Step 1: page listing per category (cached in "application_list").
    #download_applications(CATEGORIES)
    applications = list_applications(CATEGORIES)
    # Step 2: raw wikitext of every listed page (cached in "content").
    #download_content(applications)
    content = list_content(applications)
    # Step 3: parse the templates and write one OpenSlides CSV per category.
    applications = parse_content(content)
    write_content(applications)