Benutzer:Hub/antragsfabrik-openslides-export.py
Hub: Script for scraping motions from the BB listing into an OpenSlides CSV.
Hub: update: read the motion ordering from the agenda (TO).
Revision as of 10:06, 20 June 2012
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib2
import json
import os
import csv

API_URL = "http://wiki.piratenbrandenburg.de/api.php"
CATEGORIES = ["Kategorie:Sonstiger_Antrag_AF_LPT_2012.1",
              "Kategorie:Satzungsänderungsantrag_AF_LPT_2012.1",
              "Kategorie:Programmantrag_AF_LPT_2012.1"]
MAX_PAGEIDS = 50
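# Fetch the raw JSON response for the given MediaWiki API query string.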
def get_json(endpoint):
    url = "".join([
API_URL,
'?',
endpoint,
'&format=json',
])
return urllib2.urlopen(url).read()
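# Collect all members of a wiki category, following the API's
# query-continue token recursively until the listing is complete.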
def get_category(category, query_continue=""):
data = get_json("action=query&list=categorymembers&cmtitle=%s&cmcontinue=%s" % (category, query_continue))
json_data = json.loads(data)
pages = json_data["query"]["categorymembers"]
if "query-continue" in json_data:
pages += get_category(category,json_data["query-continue"]["categorymembers"]["cmcontinue"])
return pages
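# Return the category listing from the local cache file "application_list"
# if it exists, otherwise download it from the wiki.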
def list_applications(categories):
if os.path.isfile("application_list"):
f = open('application_list','r')
return json.load(f)
return download_applications(categories)
def download_applications(categories):
applications = _list_applications(categories)
f = open('application_list','w+')
json.dump(applications, f)
f.flush()
return applications
def _list_applications(categories):
applications = {}
for category in categories:
pages = get_category(category)
applications[category] = pages
return applications
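# Fetch the latest revision content for a pipe-separated string of page ids.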
def get_raw_pageid(pageid):
data = get_json("action=query&prop=revisions&rvprop=content&pageids=%s" % pageid)
json_data = json.loads(data)
pages = json_data["query"]["pages"]
content = []
for pageids in pages:
content += pages[pageids]["revisions"]
return content
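# Yield successive chunks of at most n elements from list l.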
def chunks(l, n):
for i in xrange(0, len(l), n):
yield l[i:i+n]
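# Fetch page contents in batches, at most MAX_PAGEIDS ids per API request.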
def get_pageid(pageids):
pages = []
for chunk in chunks(pageids, MAX_PAGEIDS):
pages += get_raw_pageid("|".join(str(i) for i in chunk))
return pages
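# Collect the page ids per category and fetch the content of every page.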
def _list_content(applications):
pageids = {}
content = {}
for category in applications.iterkeys():
for application in applications[category]:
if category in pageids:
pageids[category] += [application["pageid"]]
else:
pageids[category] = [application["pageid"]]
content[category] = get_pageid(pageids[category])
return content
def download_content(applications):
content = _list_content(applications)
f = open('content','w+')
json.dump(content,f)
f.flush()
return content
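# Return the page contents from the local cache file "content" if it
# exists, otherwise download them from the wiki.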
def list_content(applications):
if os.path.isfile("content"):
f = open('content','r')
return json.load(f)
return download_content(applications)
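# Parse every page into a template hash, keep only motions that were
# actually submitted ("Eingereicht" non-empty), sorted by title.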
def parse_content(content):
applications = {}
for category in content.iterkeys():
applications_for_category = []
for application_content in content[category]:
application = mediawiki_template(application_content["*"])
if application["Eingereicht"] != "":
applications_for_category.append(application)
applications_for_category.sort(key = lambda a: a["Titel"])
applications[category] = applications_for_category
return applications
def mediawiki_template(mw_string):
""" returns media wiki template element as a hash"""
#Split content inside Template
strings = mw_string.split("{{")[1].split("}}")[0].split("\n|")
#remove "Antragsfabrikat"
strings = strings[1:]
mw_hash = {}
for string in strings:
keyval = string.split("=",1)
if 2 != len(keyval):
raise SyntaxError("Mediawiki parsing Error %s" % keyval)
keyval = [s.strip() for s in keyval]
key, val = keyval
mw_hash[key] = val
return mw_hash
def filter_content(content):
""" simple filter for some html tags to plain text"""
content = content.replace("1","¹")
content = content.replace("2","²")
content = content.replace("3","³")
content = content.replace("
","\n")
content = content.replace("<br\>","\n")
content = content.replace("<br\\n>","\n")
content = content.replace("
","\n")
content = content.replace("
","\n")
return content
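# Write one OpenSlides import CSV per category, named after the category.
# If a list of titles in agenda order is given, each motion is numbered
# by its position in that list; unmatched titles are reported.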
def write_content(applications, applications_position=[]):
open_position = []
open_position.extend(applications_position)
for category in applications:
f = open(category,'w+')
writer = csv.writer(f,delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
writer.writerow( ("Number","Title","Text","Reason",
"Submitter (First Name)","Submitter (Last Name)"))
for a in applications[category]:
try:
number = applications_position.index(a["Titel"]) + 1
open_position.remove(a["Titel"])
except ValueError:
print '"' + a["Titel"] + '" im Antragsbuch nicht gefunden'
number = ""
writer.writerow( ( number, # number starts at 1
a["Titel"].encode('utf8'),
filter_content(a["Antragstext"].encode('utf8')),
filter_content(a[u'Begr\xfcndung'].encode('utf8')),
a["Antragsteller"].encode('utf8'),
"") ) #Last Name
f.flush()
f.close()
if open_position != []:
print "Anträge aus dem Antragsbuch, die nicht gefunden wurden: "
for a in open_position:
print a
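# Read the agenda order from a file with one motion title per line (UTF-8).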
def get_application_positions(filename):
f = open(filename,'r')
lines = [l.strip().decode('utf8') for l in f.readlines()]
return lines
if __name__ == '__main__':
#download_applications(CATEGORIES)
applications = list_applications(CATEGORIES)
#download_content(applications)
content = list_content(applications)
applications = parse_content(content)
    #One title per line, in the order given by the agenda (TO)
    #positions = get_application_positions("reihenfolge-to")
    positions = []  # fallback when no agenda order file is used
    write_content(applications, positions)
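Usage note: run the script from a writable working directory. It caches the wiki responses in the files "application_list" and "content" (delete them to force a fresh download) and writes one OpenSlides import CSV per category, named after the full category name. To number the motions in agenda order, put one motion title per line into a file (the script expects "reihenfolge-to") and uncomment the get_application_positions call in the __main__ block.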
