#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Script source: https://gist.github.com/3488254

import urllib
import urllib2
import json
import os
import csv

API_URL = "http://wiki.piratenbrandenburg.de/api.php"
CATEGORIES = ["Kategorie:Sonstiger_Antrag_AF_LPT_2012.1",
              "Kategorie:Satzungsänderungsantrag_AF_LPT_2012.1",
              "Kategorie:Programmantrag_AF_LPT_2012.1"]
MAX_PAGEIDS = 50  # the MediaWiki API accepts at most 50 page ids per query


def get_json(endpoint):
    """Performs an API request and returns the raw JSON response."""
    url = ''.join([API_URL, '?', endpoint, '&format=json'])
    return urllib2.urlopen(url).read()


def get_category(category, query_continue=""):
    """Returns all members of a category, following continuation markers."""
    data = get_json("action=query&list=categorymembers&cmtitle=%s&cmcontinue=%s"
                    % (urllib.quote(category), urllib.quote(query_continue)))
    json_data = json.loads(data)
    pages = json_data["query"]["categorymembers"]
    if "query-continue" in json_data:
        pages += get_category(
            category, json_data["query-continue"]["categorymembers"]["cmcontinue"])
    return pages


def list_applications(categories):
    """Returns the application list, reading the on-disk cache if present."""
    if os.path.isfile("application_list"):
        with open('application_list', 'r') as f:
            return json.load(f)
    return download_applications(categories)


def download_applications(categories):
    applications = _list_applications(categories)
    with open('application_list', 'w+') as f:
        json.dump(applications, f)
    return applications


def _list_applications(categories):
    applications = {}
    for category in categories:
        applications[category] = get_category(category)
    return applications


def get_raw_pageid(pageids):
    """Fetches the latest revision content for a |-separated list of page ids."""
    data = get_json("action=query&prop=revisions&rvprop=content&pageids=%s" % pageids)
    json_data = json.loads(data)
    pages = json_data["query"]["pages"]
    content = []
    for pageid in pages:
        content += pages[pageid]["revisions"]
    return content


def chunks(l, n):
    """Yields successive n-sized chunks from l."""
    for i in xrange(0, len(l), n):
        yield l[i:i + n]


def get_pageid(pageids):
    """Fetches revision contents in batches of MAX_PAGEIDS ids."""
    pages = []
    for chunk in chunks(pageids, MAX_PAGEIDS):
        pages += get_raw_pageid("|".join(str(i) for i in chunk))
    return pages


def _list_content(applications):
    content = {}
    for category in applications.iterkeys():
        pageids = [application["pageid"] for application in applications[category]]
        content[category] = get_pageid(pageids)
    return content


def download_content(applications):
    content = _list_content(applications)
    with open('content', 'w+') as f:
        json.dump(content, f)
    return content


def list_content(applications):
    """Returns the page contents, reading the on-disk cache if present."""
    if os.path.isfile("content"):
        with open('content', 'r') as f:
            return json.load(f)
    return download_content(applications)


def parse_content(content):
    applications = {}
    for category in content.iterkeys():
        applications_for_category = []
        for application_content in content[category]:
            application = mediawiki_template(application_content["*"])
            # skip applications that have not been submitted
            if application["Eingereicht"] != "":
                applications_for_category.append(application)
        applications_for_category.sort(key=lambda a: a["Titel"])
        applications[category] = applications_for_category
    return applications


def mediawiki_template(mw_string):
    """Returns a MediaWiki template element as a hash."""
    # split the content inside the template
    strings = mw_string.split("{{")[1].split("}}")[0].split("\n|")
    # remove the template name ("Antragsfabrikat")
    strings = strings[1:]
    mw_hash = {}
    for string in strings:
        keyval = string.split("=", 1)
        if 2 != len(keyval):
            raise SyntaxError("MediaWiki parsing error: %s" % keyval)
        key, val = [s.strip() for s in keyval]
        mw_hash[key] = val
    return mw_hash


def filter_content(content):
    """Simple filter that converts some HTML tags to plain text."""
    content = content.replace("<sup>1</sup>", "¹")
    content = content.replace("<sup>2</sup>", "²")
    content = content.replace("<sup>3</sup>", "³")
    content = content.replace("<br>", "\n")
    content = content.replace("<br\\>", "\n")
    content = content.replace("<br\\n>", "\n")
    content = content.replace("<br />", "\n")
    content = content.replace("<br/>", "\n")
    return content


def write_content(applications, applications_position=[]):
    open_position = []
    open_position.extend(applications_position)
    for category in applications:
        with open(category, 'w+') as f:
            writer = csv.writer(f, delimiter=',', quotechar='"',
                                quoting=csv.QUOTE_ALL)
            writer.writerow(("Number", "Title", "Text", "Reason",
                             "Submitter (First Name)", "Submitter (Last Name)"))
            for a in applications[category]:
                try:
                    number = applications_position.index(a["Titel"]) + 1  # numbering starts at 1
                    open_position.remove(a["Titel"])
                except ValueError:
                    print ('"%s" not found in the application book'
                           % a["Titel"]).encode('utf8')
                    number = ""
                writer.writerow((
                    number,
                    a["Titel"].encode('utf8'),
                    filter_content(a["Antragstext"].encode('utf8')),
                    filter_content(a[u'Begr\xfcndung'].encode('utf8')),
                    a["Antragsteller"].encode('utf8'),
                    ""))  # last name is not available separately
    if open_position != []:
        print "Applications from the application book that were not found:"
        for a in open_position:
            print a.encode('utf8')


def get_application_positions(filename):
    """Reads one title per line, in the order given by the agenda (TO)."""
    with open(filename, 'r') as f:
        return [l.strip().decode('utf8') for l in f.readlines()]


if __name__ == '__main__':
    # download_applications(CATEGORIES)   # force a fresh download of the list
    applications = list_applications(CATEGORIES)
    # download_content(applications)      # force a fresh download of the contents
    content = list_content(applications)
    applications = parse_content(content)
    # One title per line, agenda (TO) order given:
    # positions = get_application_positions("reihenfolge-to")
    # write_content(applications, positions)
    write_content(applications)
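
# Usage sketch (an illustration, not part of the original script; the file
# name below is an assumption): the first run downloads the category listing
# and the page contents and caches them in "application_list" and "content"
# in the working directory, so delete those two files to force a refresh.
#
#   $ python export_antraege.py
#
# Output: one CSV file per category (named after the category), with the
# columns Number, Title, Text, Reason, Submitter (First/Last Name).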