|
|
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Script can be found here: https://gist.github.com/3488254
− | import urllib2
| |
− | import json
| |
− | import os
| |
− | import csv
| |
− | | |
# MediaWiki API endpoint of the Brandenburg Pirate Party wiki.
API_URL = "http://wiki.piratenbrandenburg.de/api.php"
# Wiki categories whose member pages (applications) get exported.
CATEGORIES = ["Kategorie:Sonstiger_Antrag_AF_LPT_2012.1",
              "Kategorie:Satzungsänderungsantrag_AF_LPT_2012.1",
              "Kategorie:Programmantrag_AF_LPT_2012.1"]
# Batch size for pageid lookups — presumably the API's per-request
# limit for unprivileged users; confirm against the wiki's settings.
MAX_PAGEIDS = 50
| |
− | | |
def get_json(endpoint):
    """Fetch *endpoint* from the wiki API and return the raw response body.

    The "&format=json" switch is appended so the API answers in JSON;
    the body is returned undecoded as read from the socket.
    """
    request_url = "%s?%s&format=json" % (API_URL, endpoint)
    return urllib2.urlopen(request_url).read()
| |
− | | |
def get_category(category, query_continue=""):
    """Return all member pages of *category*, following API pagination.

    Recurses with the "cmcontinue" token for as long as the API reports
    more results via a "query-continue" key.
    """
    # NOTE(review): *category* is interpolated unescaped into the query
    # string — non-ASCII titles appear to be accepted by this wiki, but
    # URL-encoding would be safer; confirm before changing.
    raw = get_json(
        "action=query&list=categorymembers&cmtitle=%s&cmcontinue=%s"
        % (category, query_continue))
    payload = json.loads(raw)
    members = payload["query"]["categorymembers"]
    if "query-continue" in payload:
        token = payload["query-continue"]["categorymembers"]["cmcontinue"]
        members += get_category(category, token)
    return members
| |
− | | |
def list_applications(categories):
    """Return the cached application list if present, else download it.

    Reads the JSON cache file "application_list" from the current
    directory when it exists; otherwise delegates to
    download_applications().  The handle is now closed via ``with``
    (the original opened the file and leaked it).
    """
    if os.path.isfile("application_list"):
        with open('application_list', 'r') as f:
            return json.load(f)
    return download_applications(categories)
| |
− | | |
def download_applications(categories):
    """Fetch the application lists for *categories* and cache them.

    Writes the result as JSON to "application_list" in the current
    directory and returns it.  The file is now written inside ``with``
    so it is flushed and closed deterministically (the original only
    flushed and leaked the handle).
    """
    applications = _list_applications(categories)
    with open('application_list', 'w+') as f:
        json.dump(applications, f)
    return applications
| |
− | | |
def _list_applications(categories):
    """Download the member pages of every category.

    Returns a dict mapping each category name to its page list as
    delivered by get_category().
    """
    return {category: get_category(category) for category in categories}
| |
− | | |
def get_raw_pageid(pageid):
    """Fetch revision content for *pageid* (may be a "|"-joined batch).

    Queries the API for the latest revision content of the given
    page id(s) and returns the concatenated "revisions" lists of all
    pages in the response.
    """
    raw = get_json(
        "action=query&prop=revisions&rvprop=content&pageids=%s" % pageid)
    pages = json.loads(raw)["query"]["pages"]
    revisions = []
    for page_key in pages:
        revisions.extend(pages[page_key]["revisions"])
    return revisions
| |
− | | |
def chunks(l, n):
    """Yield successive *n*-sized slices of list *l* (last may be shorter)."""
    # range instead of Python-2-only xrange: identical behaviour here
    # and keeps the helper runnable on Python 3 as well.
    for i in range(0, len(l), n):
        yield l[i:i + n]
| |
− | | |
def get_pageid(pageids):
    """Resolve *pageids* in batches of MAX_PAGEIDS; return all revisions.

    Each batch is joined with "|" as the MediaWiki API expects and
    handed to get_raw_pageid(); the per-batch results are concatenated.
    """
    revisions = []
    for batch in chunks(pageids, MAX_PAGEIDS):
        joined = "|".join(str(pid) for pid in batch)
        revisions.extend(get_raw_pageid(joined))
    return revisions
| |
− | | |
def _list_content(applications):
    """Download the page content for every application, per category.

    *applications* maps category -> list of page dicts (each carrying a
    "pageid" key); the result maps category -> list of revision dicts.
    """
    content = {}
    # Iterate the dict directly instead of Python-2-only iterkeys(),
    # and collect the pageids with a comprehension instead of the
    # original grow-a-dict-entry-by-entry accumulation.
    for category in applications:
        pageids = [application["pageid"]
                   for application in applications[category]]
        content[category] = get_pageid(pageids)
    return content
| |
− | | |
def download_content(applications):
    """Download page content for *applications* and cache it to "content".

    The cache file is written as JSON inside ``with`` so it is closed
    deterministically (the original only flushed and leaked the handle).
    """
    content = _list_content(applications)
    with open('content', 'w+') as f:
        json.dump(content, f)
    return content
| |
− | | |
def list_content(applications):
    """Return the cached page content if present, else download it.

    Reads the JSON cache file "content" from the current directory when
    it exists; otherwise delegates to download_content().  The handle
    is now closed via ``with`` (the original leaked it).
    """
    if os.path.isfile("content"):
        with open('content', 'r') as f:
            return json.load(f)
    return download_content(applications)
| |
− | | |
def parse_content(content):
    """Parse raw revision content into sorted application dicts.

    For every category, each revision's wikitext (under the "*" key) is
    run through mediawiki_template(); applications whose "Eingereicht"
    (submitted) field is empty are dropped, the rest sorted by "Titel".
    """
    applications = {}
    # Plain dict iteration replaces Python-2-only iterkeys().
    for category in content:
        submitted = []
        for revision in content[category]:
            application = mediawiki_template(revision["*"])
            # Skip drafts that were never formally submitted.
            if application["Eingereicht"] != "":
                submitted.append(application)
        submitted.sort(key=lambda a: a["Titel"])
        applications[category] = submitted
    return applications
| |
− | | |
def mediawiki_template(mw_string):
    """Parse the first MediaWiki template in *mw_string* into a dict.

    Only the text between the first "{{" and the following "}}" is
    considered.  The first "\\n|"-separated token (the template name,
    here "Antragsfabrikat") is discarded; every remaining "key=value"
    field becomes one dict entry with both sides stripped.

    Raises SyntaxError when a field contains no "=" separator.
    """
    inner = mw_string.split("{{")[1].split("}}")[0]
    fields = inner.split("\n|")[1:]  # drop the template name
    parsed = {}
    for field in fields:
        parts = field.split("=", 1)
        if len(parts) != 2:
            raise SyntaxError("Mediawiki parsing Error %s" % parts)
        key = parts[0].strip()
        parsed[key] = parts[1].strip()
    return parsed
| |
− | | |
def filter_content(content):
    """Simple filter turning a few HTML tags into plain text.

    Superscript digits 1-3 become their Unicode equivalents; every
    spelling of a line break tag (including two typo'd variants with
    literal backslashes) becomes a newline.
    """
    replacements = (
        ("<sup>1</sup>", "¹"),
        ("<sup>2</sup>", "²"),
        ("<sup>3</sup>", "³"),
        ("<br>", "\n"),
        ("<br\\>", "\n"),
        ("<br\\n>", "\n"),
        ("<br />", "\n"),
        ("<br/>", "\n"),
    )
    for old, new in replacements:
        content = content.replace(old, new)
    return content
| |
− | | |
− | | |
def write_content(applications, applications_position=None):
    """Write one CSV file per category listing its applications.

    applications_position: optional list of titles giving the agenda
    order; an application's 1-based position in that list becomes its
    "Number" column.  The default was changed from a mutable ``[]`` to
    ``None`` (same observable behaviour, avoids the shared-mutable-
    default pitfall).

    Prints (in German) every title missing from the position list, and
    every position that no application matched.
    """
    if applications_position is None:
        applications_position = []
    # Positions not yet matched by any application.
    open_position = list(applications_position)
    for category in applications:
        # NOTE(review): the category name (e.g. "Kategorie:...") is used
        # verbatim as the output filename.
        with open(category, 'w+') as f:
            writer = csv.writer(f, delimiter=',', quotechar='"',
                                quoting=csv.QUOTE_ALL)
            writer.writerow(("Number", "Title", "Text", "Reason",
                             "Submitter (First Name)", "Submitter (Last Name)"))
            for a in applications[category]:
                try:
                    number = applications_position.index(a["Titel"]) + 1
                    open_position.remove(a["Titel"])
                except ValueError:
                    # Title absent from the agenda list -> no number.
                    print('"' + a["Titel"] + '" im Antragsbuch nicht gefunden')
                    number = ""
                # NOTE(review): .encode('utf8') assumes Python-2 unicode
                # strings; under Python 3 csv would write bytes reprs.
                writer.writerow((number,  # number starts at 1
                                 a["Titel"].encode('utf8'),
                                 filter_content(a["Antragstext"].encode('utf8')),
                                 filter_content(a[u'Begr\xfcndung'].encode('utf8')),
                                 a["Antragsteller"].encode('utf8'),
                                 ""))  # Last Name
    if open_position != []:
        print("Anträge aus dem Antragsbuch, die nicht gefunden wurden: ")
        for a in open_position:
            print(a)
| |
− | | |
def get_application_positions(filename):
    """Read one application title per line from *filename* (UTF-8).

    Returns the stripped lines as unicode strings.  io.open decodes the
    file directly (works on Python 2 and 3) and the ``with`` block
    closes the handle the original leaked.
    """
    import io  # local import keeps this fix self-contained
    with io.open(filename, 'r', encoding='utf8') as f:
        return [line.strip() for line in f]
| |
− | | |
if __name__ == '__main__':
    applications = list_applications(CATEGORIES)
    content = list_content(applications)
    applications = parse_content(content)
    # One title per line, agenda ("TO") order as given in the file.
    # BUG FIX: the original commented this assignment out but still
    # passed ``positions`` to write_content() -> NameError at runtime.
    # Fall back to an empty order list when the file is absent.
    if os.path.isfile("reihenfolge-to"):
        positions = get_application_positions("reihenfolge-to")
    else:
        positions = []
    write_content(applications, positions)
| |