Benutzer:Hub/antragsfabrik-openslides-export.py: Unterschied zwischen den Versionen
Hub (Diskussion | Beiträge) (Script zum Scrapen von Anträgen aus dem BB-Listing in eine OpenSlides-CSV.) |
Hub (Diskussion | Beiträge) (update Reihenfolge aus TO auslesen) |
||
Zeile 1: | Zeile 1: | ||
− | + | #!/usr/bin/env python | |
# -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||
import urllib2 | import urllib2 | ||
Zeile 101: | Zeile 101: | ||
if application["Eingereicht"] != "": | if application["Eingereicht"] != "": | ||
applications_for_category.append(application) | applications_for_category.append(application) | ||
− | applications_for_category.sort(key = lambda | + | applications_for_category.sort(key = lambda a: a["Titel"]) |
applications[category] = applications_for_category | applications[category] = applications_for_category | ||
return applications | return applications | ||
Zeile 134: | Zeile 134: | ||
− | def write_content(applications): | + | def write_content(applications, applications_position=[]): |
+ | open_position = [] | ||
+ | open_position.extend(applications_position) | ||
for category in applications: | for category in applications: | ||
f = open(category,'w+') | f = open(category,'w+') | ||
Zeile 141: | Zeile 143: | ||
"Submitter (First Name)","Submitter (Last Name)")) | "Submitter (First Name)","Submitter (Last Name)")) | ||
for a in applications[category]: | for a in applications[category]: | ||
− | writer.writerow( ( | + | try: |
+ | number = applications_position.index(a["Titel"]) + 1 | ||
+ | open_position.remove(a["Titel"]) | ||
+ | except ValueError: | ||
+ | print '"' + a["Titel"] + '" im Antragsbuch nicht gefunden' | ||
+ | number = "" | ||
+ | writer.writerow( ( number, # number starts at 1 | ||
a["Titel"].encode('utf8'), | a["Titel"].encode('utf8'), | ||
filter_content(a["Antragstext"].encode('utf8')), | filter_content(a["Antragstext"].encode('utf8')), | ||
Zeile 149: | Zeile 157: | ||
f.flush() | f.flush() | ||
f.close() | f.close() | ||
+ | if open_position != []: | ||
+ | print "Anträge aus dem Antragsbuch, die nicht gefunden wurden: " | ||
+ | for a in open_position: | ||
+ | print a | ||
+ | |||
+ | def get_application_positions(filename): | ||
+ | f = open(filename,'r') | ||
+ | lines = [l.strip().decode('utf8') for l in f.readlines()] | ||
+ | return lines | ||
if __name__ == '__main__': | if __name__ == '__main__': | ||
Zeile 156: | Zeile 173: | ||
content = list_content(applications) | content = list_content(applications) | ||
applications = parse_content(content) | applications = parse_content(content) | ||
− | write_content(applications) | + | #Ein Titel per Zeile, TO-Reihenfolge gegeben |
+ | #positions = get_application_positions("reihenfolge-to") | ||
+ | write_content(applications, positions) |
Version vom 20. Juni 2012, 12:06 Uhr
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib2
import json
import os
import csv
# MediaWiki API endpoint of the Brandenburg Pirate Party wiki.
API_URL = "http://wiki.piratenbrandenburg.de/api.php"
# Wiki categories whose member pages contain the motions to export.
CATEGORIES = ["Kategorie:Sonstiger_Antrag_AF_LPT_2012.1", "Kategorie:Satzungsänderungsantrag_AF_LPT_2012.1", "Kategorie:Programmantrag_AF_LPT_2012.1"]
# Maximum number of page ids sent in a single API request (see get_pageid).
MAX_PAGEIDS = 50
def get_json(endpoint):
    """Call the wiki API with the given query string and return the raw
    JSON response body as a byte string.

    NOTE(review): the separator of the join was lost in the wiki export
    of this script (`url = .join([...])` is a syntax error); ''.join
    restores the obvious intent — plain concatenation of the URL parts.
    """
    url = ''.join([
        API_URL,
        '?',
        endpoint,
        '&format=json',
    ])
    return urllib2.urlopen(url).read()
def get_category(category, query_continue=""):
    """Return all member pages of a wiki category.

    Follows the API's query-continue mechanism until every batch of
    category members has been collected.
    """
    pages = []
    cont = query_continue
    while True:
        raw = get_json("action=query&list=categorymembers&cmtitle=%s&cmcontinue=%s" % (category, cont))
        reply = json.loads(raw)
        pages += reply["query"]["categorymembers"]
        if "query-continue" not in reply:
            return pages
        cont = reply["query-continue"]["categorymembers"]["cmcontinue"]
def list_applications(categories):
    """Return the application listing for the given categories.

    Uses the local cache file 'application_list' when present; otherwise
    downloads the listing from the wiki.
    """
    if os.path.isfile("application_list"):
        # with-block closes the cache file (the old code leaked the handle)
        with open('application_list', 'r') as f:
            return json.load(f)
    return download_applications(categories)
def download_applications(categories):
    """Download the application listing and cache it in 'application_list'.

    Returns the listing dict (category -> list of category-member pages).
    """
    applications = _list_applications(categories)
    # with-block flushes and closes the cache file; the old code only
    # flushed and left the handle open.
    with open('application_list', 'w+') as f:
        json.dump(applications, f)
    return applications
def _list_applications(categories):
    """Fetch the member pages of every category from the wiki API."""
    return dict((cat, get_category(cat)) for cat in categories)
def get_raw_pageid(pageid):
    """Fetch the revision content for the given page id(s).

    pageid may be a single id or several ids joined with '|'; the
    revisions of all returned pages are concatenated into one list.
    """
    raw = get_json("action=query&prop=revisions&rvprop=content&pageids=%s" % pageid)
    pages = json.loads(raw)["query"]["pages"]
    revisions = []
    for page in pages.values():
        revisions.extend(page["revisions"])
    return revisions
def chunks(l, n):
    """Yield successive slices of l holding at most n elements each.

    The final chunk may be shorter. Works for any sliceable sequence.
    """
    # while-loop instead of the Python-2-only xrange keeps this helper
    # working unchanged on both Python 2 and Python 3.
    i = 0
    while i < len(l):
        yield l[i:i + n]
        i += n
def get_pageid(pageids):
    """Fetch revision content for all page ids, batching the requests.

    The ids are requested in groups of MAX_PAGEIDS, joined with '|',
    to respect the API's per-request limit.
    """
    result = []
    for batch in chunks(pageids, MAX_PAGEIDS):
        joined = "|".join(str(pid) for pid in batch)
        result.extend(get_raw_pageid(joined))
    return result
def _list_content(applications):
    """Download the page content of every application, per category.

    applications maps category -> list of category-member page dicts
    (each having a "pageid"); returns category -> list of revisions.
    """
    pageids = {}
    content = {}
    # plain dict iteration replaces the Python-2-only iterkeys();
    # setdefault replaces the old manual "key present?" branching.
    for category in applications:
        for application in applications[category]:
            pageids.setdefault(category, []).append(application["pageid"])
        # NOTE: like the original, this raises KeyError for a category
        # without applications — callers never pass empty categories here.
        content[category] = get_pageid(pageids[category])
    return content
def download_content(applications):
    """Download all application page content and cache it in 'content'.

    Returns the content dict (category -> list of revisions).
    """
    content = _list_content(applications)
    # with-block flushes and closes the cache file; the old code only
    # flushed and left the handle open.
    with open('content', 'w+') as f:
        json.dump(content, f)
    return content
def list_content(applications):
    """Return the page content for all applications.

    Uses the local cache file 'content' when present; otherwise downloads
    the content from the wiki.
    """
    if os.path.isfile("content"):
        # with-block closes the cache file (the old code leaked the handle)
        with open('content', 'r') as f:
            return json.load(f)
    return download_content(applications)
def parse_content(content):
    """Parse downloaded wiki pages into application dicts per category.

    Only applications whose "Eingereicht" (submitted) field is non-empty
    are kept; each category's list is sorted by title.
    """
    applications = {}
    # plain dict iteration replaces the Python-2-only iterkeys()
    for category in content:
        submitted = []
        for page in content[category]:
            application = mediawiki_template(page["*"])
            if application["Eingereicht"] != "":
                submitted.append(application)
        submitted.sort(key=lambda a: a["Titel"])
        applications[category] = submitted
    return applications
def mediawiki_template(mw_string):
    """ returns media wiki template element as a hash"""
    # take everything between the first "{{" and the following "}}"
    inner = mw_string.split("{{")[1].split("}}")[0]
    # drop the leading template name segment ("Antragsfabrikat")
    fields = inner.split("\n|")[1:]
    result = {}
    for field in fields:
        keyval = field.split("=", 1)
        if 2 != len(keyval):
            raise SyntaxError("Mediawiki parsing Error %s" % keyval)
        result[keyval[0].strip()] = keyval[1].strip()
    return result
def filter_content(content):
    """ simple filter for some html tags to plain text

    NOTE(review): this body was reconstructed from a garbled wiki export.
    The superscript search strings presumably were <sup>N</sup> tags and
    several of the newline search strings presumably were <br> variants
    that the wiki rendered away — TODO confirm against the original file.
    """
    # superscript digits
    content = content.replace("<sup>1</sup>", "¹")
    content = content.replace("<sup>2</sup>", "²")
    content = content.replace("<sup>3</sup>", "³")
    # assorted line-break spellings found in the motion texts
    content = content.replace("<br>", "\n")
    content = content.replace("<br/>", "\n")
    content = content.replace("<br\>", "\n")
    content = content.replace("<br\\n>", "\n")
    content = content.replace("<br />", "\n")
    return content
def write_content(applications, applications_position=None):
    """Write one OpenSlides motion-import CSV file per category.

    applications_position is an optional list of titles giving the motion
    numbering (agenda order, 1-based); motions not in the list get an
    empty number. A report of unmatched agenda titles is printed.

    Fixes vs. the previous revision: the mutable default argument ([])
    is replaced by None, and the CSV file is closed via a with-block even
    when writing raises.
    """
    if applications_position is None:
        applications_position = []
    # agenda titles that have not been matched to an application yet
    open_position = list(applications_position)
    for category in applications:
        with open(category, 'w+') as f:
            writer = csv.writer(f, delimiter=',', quotechar='"',
                                quoting=csv.QUOTE_ALL)
            writer.writerow(("Number", "Title", "Text", "Reason",
                             "Submitter (First Name)", "Submitter (Last Name)"))
            for a in applications[category]:
                try:
                    number = applications_position.index(a["Titel"]) + 1
                    open_position.remove(a["Titel"])
                except ValueError:
                    print('"' + a["Titel"] + '" im Antragsbuch nicht gefunden')
                    number = ""
                writer.writerow((number,  # number starts at 1
                                 a["Titel"].encode('utf8'),
                                 filter_content(a["Antragstext"].encode('utf8')),
                                 filter_content(a[u'Begr\xfcndung'].encode('utf8')),
                                 a["Antragsteller"].encode('utf8'),
                                 ""))  # Last Name
    if open_position != []:
        print("Anträge aus dem Antragsbuch, die nicht gefunden wurden: ")
        for a in open_position:
            print(a)
def get_application_positions(filename):
    """Read one application title per line from filename (UTF-8).

    Returns the stripped titles as unicode strings, in file order —
    this is the agenda (TO) ordering consumed by write_content.
    """
    # binary mode + explicit decode matches the old behavior on Python 2
    # and also works on Python 3; the with-block closes the handle the
    # old code leaked.
    with open(filename, 'rb') as f:
        return [line.strip().decode('utf8') for line in f]
if __name__ == '__main__':
    #download_applications(CATEGORIES)
    applications = list_applications(CATEGORIES)
    #download_content(applications)
    content = list_content(applications)
    applications = parse_content(content)
    # One title per line, agenda (TO) order given:
    # positions = get_application_positions("reihenfolge-to")
    # Default to no agenda numbering. Previously `positions` was undefined
    # here (NameError) because the assignment above is commented out.
    positions = []
    write_content(applications, positions)