Benutzer:Hub/antragsfabrik-openslides-export.py

(Script for scraping motions from the BB listing into OpenSlides CSV.)

Revision as of 15 June 2012, 13:15

 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 import urllib2
 import json
 import os 
 import csv
 API_URL = "http://wiki.piratenbrandenburg.de/api.php"
 CATEGORIES = ["Kategorie:Sonstiger_Antrag_AF_LPT_2012.1",
               "Kategorie:Satzungsänderungsantrag_AF_LPT_2012.1",
               "Kategorie:Programmantrag_AF_LPT_2012.1"]
 MAX_PAGEIDS = 50
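 # MediaWiki's query API caps the number of pageids per request (typically
 # 50 for normal accounts), hence the chunking in get_pageid() below.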
 def get_json(endpoint):
   """Fetch an API endpoint from the wiki and return the raw JSON response."""
   url = "".join([
            API_URL,
            '?',
            endpoint,
            '&format=json',
            ])
   return urllib2.urlopen(url).read()
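 # Category members come back in batches; a "query-continue" token in the
 # response signals that more members remain to be fetched.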
 def get_category(category, query_continue=""):
   data = get_json("action=query&list=categorymembers&cmtitle=%s&cmcontinue=%s" % (category, query_continue))
   json_data = json.loads(data)
   pages = json_data["query"]["categorymembers"]
   if "query-continue" in json_data:
     pages += get_category(category,json_data["query-continue"]["categorymembers"]["cmcontinue"])
   return pages
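 # The category listing and the page contents are cached on disk
 # ("application_list" and "content"), so repeated runs skip the API.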
 def list_applications(categories):
   if os.path.isfile("application_list"):
     f = open('application_list','r')
     return json.load(f)
   return download_applications(categories)
 def download_applications(categories):
   applications = _list_applications(categories)
   f = open('application_list','w+')
   json.dump(applications, f)
   f.flush()
   return applications
 def _list_applications(categories):
   applications = {}
   for category in categories:
     pages = get_category(category)
     applications[category] = pages
   return applications
 def get_raw_pageid(pageid):
   data = get_json("action=query&prop=revisions&rvprop=content&pageids=%s" % pageid)
   json_data = json.loads(data)
   pages = json_data["query"]["pages"]
   content = []
   for pageids in pages:
     content += pages[pageids]["revisions"]
   return content
 def chunks(l, n):
   """Yield successive n-sized chunks from list l."""
   for i in xrange(0, len(l), n):
     yield l[i:i+n]
 def get_pageid(pageids):
   pages = []
   for chunk in chunks(pageids, MAX_PAGEIDS):
     pages += get_raw_pageid("|".join(str(i) for i in chunk))
   return pages
 def _list_content(applications):
   pageids = {}
   content = {}
   for category in applications.iterkeys():
     for application in applications[category]:
       if category in pageids:
         pageids[category] += [application["pageid"]]
       else:
         pageids[category] = [application["pageid"]]
     content[category] = get_pageid(pageids[category])
   return content
 def download_content(applications):
   content = _list_content(applications)
   f = open('content','w+')
   json.dump(content,f)
   f.flush()
   return content
 def list_content(applications):
   if os.path.isfile("content"):
     f = open('content','r')
     return json.load(f)
   return download_content(applications)
 def parse_content(content):
   applications = {}
   for category in content.iterkeys():
     applications_for_category = []
     for application_content in content[category]:
       application = mediawiki_template(application_content["*"])
      if application.get("Eingereicht", "") != "":
         applications_for_category.append(application)
     applications_for_category.sort(key = lambda x: x["Titel"])
     applications[category] = applications_for_category
   return applications
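 # mediawiki_template() expects pages built on the Antragsfabrik template,
 # roughly of this shape (field values are illustrative):
 # {{Antragsfabrikat
 # |Titel = ...
 # |Antragsteller = ...
 # |Antragstext = ...
 # |Begründung = ...
 # |Eingereicht = ...
 # }}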
 def mediawiki_template(mw_string):
   """Parse a MediaWiki template invocation into a dict of its fields."""
   #Split content inside Template
   strings = mw_string.split("{{")[1].split("}}")[0].split("\n|")
   #remove "Antragsfabrikat"
   strings = strings[1:]
   mw_hash = {}
   for string in strings:
     keyval = string.split("=",1)
     if 2 != len(keyval):
      raise SyntaxError("MediaWiki parsing error: %s" % keyval)
     keyval = [s.strip() for s in keyval]
     key, val = keyval
     mw_hash[key] = val
   return mw_hash
 def filter_content(content):
   """Convert a few HTML tags (superscripts, line breaks) to plain text."""
   content = content.replace("<sup>1</sup>","¹")
   content = content.replace("<sup>2</sup>","²")
   content = content.replace("<sup>3</sup>","³")
   content = content.replace("<br>","\n")
   content = content.replace("<br\>","\n")
   content = content.replace("<br\\n>","\n")
   content = content.replace("<br/>","\n")
   content = content.replace("<br />","\n")
   return content


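 # One CSV file per category, with the column layout OpenSlides expects for
 # its motion import; "Number" is left empty in every row.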
 def write_content(applications):
   for category in applications:
     f = open(category,'w+')
     writer = csv.writer(f,delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
     writer.writerow( ("Number","Title","Text","Reason",
                       "Submitter (First Name)","Submitter (Last Name)"))
     for a in applications[category]:
       writer.writerow( ( "",
                         a["Titel"].encode('utf8'),
                         filter_content(a["Antragstext"].encode('utf8')),
                         filter_content(a[u'Begr\xfcndung'].encode('utf8')),
                         a["Antragsteller"].encode('utf8'),
                         "") ) #Last Name
     f.flush()
     f.close()
 if __name__ == '__main__':
   #download_applications(CATEGORIES)
   applications = list_applications(CATEGORIES)
   #download_content(applications)
   content = list_content(applications)
   applications = parse_content(content)
   write_content(applications)
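
The script caches the category listing in "application_list" and the page contents in "content" in the working directory; delete those files to force a fresh download. It writes one CSV file per category, named after the category, for import into OpenSlides.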