
Benutzer:Hub/antragsfabrik-openslides-export.py

Script for scraping applications (Anträge) from the BB listing into an OpenSlides CSV.
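The script below pulls the category members and page contents for the Antragsfabrik categories via the MediaWiki API, caches both responses on disk, parses the Antragsfabrikat template fields out of the raw wiki text, and writes one OpenSlides import CSV per category.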
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 import urllib2
 import json
 import os 
 import csv
 API_URL = "http://wiki.piratenbrandenburg.de/api.php"
 CATEGORIES = ["Kategorie:Sonstiger_Antrag_AF_LPT_2012.1",
               "Kategorie:Satzungsänderungsantrag_AF_LPT_2012.1",
               "Kategorie:Programmantrag_AF_LPT_2012.1"]
 MAX_PAGEIDS = 50
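 # Fetch the given API query from API_URL and return the raw JSON response body.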
 def get_json(endpoint):
   url = "".join([
            API_URL,
            '?',
            endpoint,
            '&format=json',
            ])
   return urllib2.urlopen(url).read()
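 # Collect all members of a category, following cmcontinue paging recursively.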
 def get_category(category, query_continue=""):
   data = get_json("action=query&list=categorymembers&cmtitle=%s&cmcontinue=%s" % (category, query_continue))
   json_data = json.loads(data)
   pages = json_data["query"]["categorymembers"]
   if "query-continue" in json_data:
     pages += get_category(category,json_data["query-continue"]["categorymembers"]["cmcontinue"])
   return pages
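 # Return the cached application list from disk if present, else download it.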
 def list_applications(categories):
   if os.path.isfile("application_list"):
     f = open('application_list','r')
     return json.load(f)
   return download_applications(categories)
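 # Fetch the application lists and cache them in the file "application_list".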
 def download_applications(categories):
   applications = _list_applications(categories)
   f = open('application_list','w+')
   json.dump(applications, f)
   f.flush()
   return applications
 def _list_applications(categories):
   applications = {}
   for category in categories:
     pages = get_category(category)
     applications[category] = pages
   return applications
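 # Fetch the latest revision content for the given pageids (pipe-separated string).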
 def get_raw_pageid(pageid):
   data = get_json("action=query&prop=revisions&rvprop=content&pageids=%s" % pageid)
   json_data = json.loads(data)
   pages = json_data["query"]["pages"]
   content = []
   for page_id in pages:
     content += pages[page_id]["revisions"]
   return content
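 # Yield successive n-sized chunks from list l.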
 def chunks(l, n):
   for i in xrange(0, len(l), n):
     yield l[i:i+n]
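 # Fetch page contents in batches of at most MAX_PAGEIDS ids per API request.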
 def get_pageid(pageids):
   pages = []
   for chunk in chunks(pageids, MAX_PAGEIDS):
     pages += get_raw_pageid("|".join(str(i) for i in chunk))
   return pages
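 # Collect the pageids per category and download the page contents for each.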
 def _list_content(applications):
   pageids = {}
   content = {}
   for category in applications.iterkeys():
     for application in applications[category]:
       if category in pageids:
         pageids[category] += [application["pageid"]]
       else:
         pageids[category] = [application["pageid"]]
     content[category] = get_pageid(pageids[category])
   return content
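 # Download all page contents and cache them in the file "content".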
 def download_content(applications):
   content = _list_content(applications)
   f = open('content','w+')
   json.dump(content,f)
   f.flush()
   return content
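 # Return the cached page contents from disk if present, else download them.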
 def list_content(applications):
   if os.path.isfile("content"):
     f = open('content','r')
     return json.load(f)
   return download_content(applications)
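 # Parse the raw wiki text into field dicts, keep only submitted applications
 # (non-empty "Eingereicht") and sort them by "Titel".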
 def parse_content(content):
   applications = {}
   for category in content.iterkeys():
     applications_for_category = []
     for application_content in content[category]:
       application = mediawiki_template(application_content["*"])
       if application["Eingereicht"] != "":
         applications_for_category.append(application)
     applications_for_category.sort(key = lambda x: x["Titel"])
     applications[category] = applications_for_category
   return applications
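 # Parse a single "Antragsfabrikat" template into a dict of its |key=value fields.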
 def mediawiki_template(mw_string):
   """ returns media wiki template element as a hash"""
   #Split content inside Template
   strings = mw_string.split("{{")[1].split("}}")[0].split("\n|")
   #remove "Antragsfabrikat"
   strings = strings[1:]
   mw_hash = {}
   for string in strings:
     keyval = string.split("=",1)
     if len(keyval) != 2:
       raise SyntaxError("MediaWiki parsing error: %s" % keyval)
     keyval = [s.strip() for s in keyval]
     key, val = keyval
     mw_hash[key] = val
   return mw_hash
 def filter_content(content):
   """ simple filter for some html tags to plain text"""
   content = content.replace("<sup>1</sup>","¹")
   content = content.replace("<sup>2</sup>","²")
   content = content.replace("<sup>3</sup>","³")
   content = content.replace("<br>","\n")
   content = content.replace("<br\>","\n")
   content = content.replace("<br\\n>","\n")
   content = content.replace("<br/>","\n")
   content = content.replace("<br />","\n")
   return content
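 # Write one CSV file per category; the header row matches the OpenSlides
 # motion import columns (Number, Title, Text, Reason, Submitter).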
 def write_content(applications):
   for category in applications:
     f = open(category,'w+')
     writer = csv.writer(f,delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
     writer.writerow( ("Number","Title","Text","Reason",
                       "Submitter (First Name)","Submitter (Last Name)"))
     for a in applications[category]:
       writer.writerow( ( "",
                         a["Titel"].encode('utf8'),
                         filter_content(a["Antragstext"].encode('utf8')),
                         filter_content(a[u'Begr\xfcndung'].encode('utf8')),
                         a["Antragsteller"].encode('utf8'),
                         "") ) #Last Name
     f.flush()
     f.close()
 if __name__ == '__main__':
   #download_applications(CATEGORIES)
   applications = list_applications(CATEGORIES)
   #download_content(applications)
   content = list_content(applications)
   applications = parse_content(content)
   write_content(applications)
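For illustration, a minimal sketch of the page shape mediawiki_template expects. The template values here are invented; only the structure and the "Titel" field name come from the script above (Python 2, like the script):

 sample = ("{{Antragsfabrikat\n"
           "|Titel=Beispielantrag\n"
           "|Eingereicht=Ja\n"
           "|Antragstext=Der Landesparteitag moege beschliessen ...\n"
           "|Antragsteller=Max Mustermann\n"
           "}}")
 print mediawiki_template(sample)["Titel"]  # prints: Beispielantrag

Running the script itself first fills the caches "application_list" and "content" in the working directory and then writes one CSV per category; delete the two cache files to force a fresh download.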