Script to scrape motions (Anträge) from the BB (Brandenburg) wiki listing into OpenSlides CSV files.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib
import urllib2
import json
import os
import csv

API_URL = "http://wiki.piratenbrandenburg.de/api.php"
CATEGORIES = ["Kategorie:Sonstiger_Antrag_AF_LPT_2012.1",
              "Kategorie:Satzungsänderungsantrag_AF_LPT_2012.1",
              "Kategorie:Programmantrag_AF_LPT_2012.1"]
MAX_PAGEIDS = 50  # the MediaWiki API accepts at most 50 pageids per request

def get_json(endpoint):
    """Fetch an API endpoint and return the raw JSON response body."""
    url = ''.join([
        API_URL,
        '?',
        endpoint,
        '&format=json',
    ])
    return urllib2.urlopen(url).read()

def get_category(category, query_continue=""):
    """Return all members of a category, following API continuation."""
    # quote() percent-encodes non-ASCII titles (e.g. "Satzungsänderungsantrag")
    # and continuation tokens so the request URL stays valid.
    data = get_json("action=query&list=categorymembers&cmtitle=%s&cmcontinue=%s"
                    % (urllib.quote(category), urllib.quote(query_continue)))
    json_data = json.loads(data)
    pages = json_data["query"]["categorymembers"]
    if "query-continue" in json_data:
        pages += get_category(category, json_data["query-continue"]["categorymembers"]["cmcontinue"])
    return pages

def list_applications(categories):
    """Load the cached category listing, or download it on first use."""
    if os.path.isfile("application_list"):
        with open('application_list', 'r') as f:
            return json.load(f)
    return download_applications(categories)


def download_applications(categories):
    """Download the category listing and cache it in "application_list"."""
    applications = _list_applications(categories)
    with open('application_list', 'w+') as f:
        json.dump(applications, f)
    return applications


def _list_applications(categories):
    applications = {}
    for category in categories:
        applications[category] = get_category(category)
    return applications

def get_raw_pageid(pageid):
    """Fetch the latest revision content for one or more pageids ("|"-separated)."""
    data = get_json("action=query&prop=revisions&rvprop=content&pageids=%s" % pageid)
    json_data = json.loads(data)
    pages = json_data["query"]["pages"]
    content = []
    for pid in pages:
        content += pages[pid]["revisions"]
    return content


def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in xrange(0, len(l), n):
        yield l[i:i + n]
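
# Illustrative: list(chunks(range(5), 2)) == [[0, 1], [2, 3], [4]]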

def get_pageid(pageids):
    """Fetch revisions for all pageids, batched to the API limit."""
    pages = []
    for chunk in chunks(pageids, MAX_PAGEIDS):
        pages += get_raw_pageid("|".join(str(i) for i in chunk))
    return pages

def _list_content(applications):
    """Download the page content of every application, grouped by category."""
    content = {}
    for category, members in applications.iteritems():
        pageids = [application["pageid"] for application in members]
        content[category] = get_pageid(pageids)
    return content


def download_content(applications):
    """Download all page contents and cache them in "content"."""
    content = _list_content(applications)
    with open('content', 'w+') as f:
        json.dump(content, f)
    return content


def list_content(applications):
    """Load the cached page contents, or download them on first use."""
    if os.path.isfile("content"):
        with open('content', 'r') as f:
            return json.load(f)
    return download_content(applications)

def parse_content(content):
    """Parse each page's wiki text and keep only submitted applications."""
    applications = {}
    for category in content.iterkeys():
        applications_for_category = []
        for application_content in content[category]:
            application = mediawiki_template(application_content["*"])
            if application["Eingereicht"] != "":  # "Eingereicht" = submitted
                applications_for_category.append(application)
        applications_for_category.sort(key=lambda x: x["Titel"])
        applications[category] = applications_for_category
    return applications

def mediawiki_template(mw_string):
    """Return the parameters of a MediaWiki template as a dict."""
    # Take the content between "{{" and "}}" and split it at the "|" parameter separators.
    strings = mw_string.split("{{")[1].split("}}")[0].split("\n|")
    # Drop the template name ("Antragsfabrikat").
    strings = strings[1:]
    mw_hash = {}
    for string in strings:
        keyval = string.split("=", 1)
        if len(keyval) != 2:
            raise SyntaxError("MediaWiki parsing error: %s" % keyval)
        key, val = [s.strip() for s in keyval]
        mw_hash[key] = val
    return mw_hash
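
# Illustrative example (field names as used below, values hypothetical):
#   mediawiki_template("{{Antragsfabrikat\n|Titel=Beispiel\n|Eingereicht=Ja}}")
#   returns {"Titel": "Beispiel", "Eingereicht": "Ja"}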

def filter_content(content):
    """Naively convert a few HTML tags in the wiki text to plain text."""
    content = content.replace("<sup>1</sup>", "¹")
    content = content.replace("<sup>2</sup>", "²")
    content = content.replace("<sup>3</sup>", "³")
    # <br> in all its (partly malformed) spellings found in the wiki text
    content = content.replace("<br>", "\n")
    content = content.replace("<br\\>", "\n")
    content = content.replace("<br\\n>", "\n")
    content = content.replace("<br />", "\n")
    content = content.replace("<br/>", "\n")
    return content
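
# Illustrative: filter_content("Zeile 1<br/>Zeile 2") returns "Zeile 1\nZeile 2"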

def write_content(applications):
    """Write one OpenSlides motion-import CSV per category, named after the category."""
    for category in applications:
        with open(category, 'w+') as f:
            writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
            writer.writerow(("Number", "Title", "Text", "Reason",
                             "Submitter (First Name)", "Submitter (Last Name)"))
            for a in applications[category]:
                writer.writerow(("",  # number left empty
                                 a["Titel"].encode('utf8'),
                                 filter_content(a["Antragstext"].encode('utf8')),
                                 filter_content(a[u'Begr\xfcndung'].encode('utf8')),
                                 a["Antragsteller"].encode('utf8'),
                                 ""))  # last name left empty

if __name__ == '__main__':
    # Use download_applications(CATEGORIES) / download_content(applications)
    # instead to force a refresh of the caches.
    applications = list_applications(CATEGORIES)
    content = list_content(applications)
    applications = parse_content(content)
    write_content(applications)
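
Usage sketch: the script targets Python 2 (it relies on urllib2, xrange and iterkeys) and takes no arguments. The first run downloads the category listing and page contents and caches them in the files "application_list" and "content"; later runs reuse those caches, so delete the cache files (or call the download_* functions) to re-fetch from the wiki. The result is one CSV file per category, ready for import into OpenSlides.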