Benutzer:Hub/antragsfabrik-openslides-export.py: Difference between revisions

(update: read the motion order from the agenda (TO))
(move to Github)
 
  #!/usr/bin/env python
  # -*- coding: utf-8 -*-
  import urllib2
  import json
  import os
  import csv
 
  API_URL = "http://wiki.piratenbrandenburg.de/api.php"
  CATEGORIES = ["Kategorie:Sonstiger_Antrag_AF_LPT_2012.1",
                "Kategorie:Satzungsänderungsantrag_AF_LPT_2012.1",
                "Kategorie:Programmantrag_AF_LPT_2012.1"]
  MAX_PAGEIDS = 50
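  # the MediaWiki API accepts at most 50 page ids per request for normal users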
 
  def get_json(endpoint):
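    # fetch the given API endpoint and return the raw JSON response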
    url = ''.join([
            API_URL,
            '?',
            endpoint,
            '&format=json',
            ])
    return urllib2.urlopen(url).read()
 
  def get_category(category, query_continue=""):
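    # recursively collect all members of a category, following cmcontinue paging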
    data = get_json("action=query&list=categorymembers&cmtitle=%s&cmcontinue=%s" % (category, query_continue))
    json_data = json.loads(data)
    pages = json_data["query"]["categorymembers"]
    if "query-continue" in json_data:
      pages += get_category(category,json_data["query-continue"]["categorymembers"]["cmcontinue"])
    return pages
 
  def list_applications(categories):
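    # return the cached application list if present, otherwise download it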
    if os.path.isfile("application_list"):
      f = open('application_list','r')
      return json.load(f)
    return download_applications(categories)
 
  def download_applications(categories):
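    # download the application lists and cache them in the file 'application_list'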
    applications = _list_applications(categories)
    f = open('application_list','w+')
    json.dump(applications, f)
    f.flush()
    return applications
 
  def _list_applications(categories):
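    # map each category to the list of its member pages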
    applications = {}
    for category in categories:
      pages = get_category(category)
      applications[category] = pages
    return applications
 
  def get_raw_pageid(pageid):
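    # fetch the latest revision content for the given pageid(s), '|'-separated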
    data = get_json("action=query&prop=revisions&rvprop=content&pageids=%s" % pageid)
    json_data = json.loads(data)
    pages = json_data["query"]["pages"]
    content = []
    for pageids in pages:
      content += pages[pageids]["revisions"]
    return content
 
  def chunks(l, n):
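    # yield successive n-sized slices of the list l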
    for i in xrange(0, len(l), n):
      yield l[i:i+n]
 
  def get_pageid(pageids):
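    # fetch revisions for all page ids, batched to respect MAX_PAGEIDS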
    pages = []
    for chunk in chunks(pageids, MAX_PAGEIDS):
      pages += get_raw_pageid("|".join(str(i) for i in chunk))
    return pages
 
  def _list_content(applications):
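    # collect the page ids per category and fetch their revision contents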
    pageids = {}
    content = {}
    for category in applications.iterkeys():
      for application in applications[category]:
        if category in pageids:
          pageids[category] += [application["pageid"]]
        else:
          pageids[category] = [application["pageid"]]
      content[category] = get_pageid(pageids[category])
    return content
 
  def download_content(applications):
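    # download all page contents and cache them in the file 'content'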
    content = _list_content(applications)
    f = open('content','w+')
    json.dump(content,f)
    f.flush()
    return content
 
  def list_content(applications):
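    # return the cached page contents if present, otherwise download them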
    if os.path.isfile("content"):
      f = open('content','r')
      return json.load(f)
    return download_content(applications)
 
  def parse_content(content):
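    # parse each page's template into a dict, keep only submitted
    # applications (non-empty 'Eingereicht') and sort them by title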
    applications = {}
    for category in content.iterkeys():
      applications_for_category = []
      for application_content in content[category]:
        application = mediawiki_template(application_content["*"])
        if application["Eingereicht"] != "":
          applications_for_category.append(application)
      applications_for_category.sort(key = lambda a: a["Titel"])
      applications[category] = applications_for_category
    return applications
 
  def mediawiki_template(mw_string):
    """ returns media wiki template element as a hash"""
    #Split content inside Template
    strings = mw_string.split("{{")[1].split("}}")[0].split("\n|")
    #remove "Antragsfabrikat"
    strings = strings[1:]
    mw_hash = {}
    for string in strings:
      keyval = string.split("=",1)
      if 2 != len(keyval):
        raise SyntaxError("MediaWiki parsing error: %s" % keyval)
      keyval = [s.strip() for s in keyval]
      key, val = keyval
      mw_hash[key] = val
    return mw_hash
 
  def filter_content(content):
    """ simple filter for some html tags to plain text"""
    content = content.replace("<sup>1</sup>","¹")
    content = content.replace("<sup>2</sup>","²")
    content = content.replace("<sup>3</sup>","³")
    content = content.replace("<br>","\n")
    content = content.replace("<br\>","\n")
    content = content.replace("<br\\n>","\n")
    content = content.replace("<br />","\n")
    content = content.replace("<br/>","\n")
    return content
 
 
  def write_content(applications, applications_position=[]):
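    # write one CSV per category in the OpenSlides import format;
    # applications_position gives the agenda (TO) order used for numbering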
    open_position = []
    open_position.extend(applications_position)
    for category in applications:
      f = open(category,'w+')
      writer = csv.writer(f,delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
      writer.writerow( ("Number","Title","Text","Reason",
                        "Submitter (First Name)","Submitter (Last Name)"))
      for a in applications[category]:
        try:
          number = applications_position.index(a["Titel"]) + 1
          open_position.remove(a["Titel"])
        except ValueError:
          print '"' + a["Titel"] + '" im Antragsbuch nicht gefunden'
          number = ""
        writer.writerow( ( number, # number starts at 1
                          a["Titel"].encode('utf8'),
                          filter_content(a["Antragstext"].encode('utf8')),
                          filter_content(a[u'Begr\xfcndung'].encode('utf8')),
                          a["Antragsteller"].encode('utf8'),
                          "") ) #Last Name
      f.flush()
      f.close()
    if open_position != []:
      print "Anträge aus dem Antragsbuch, die nicht gefunden wurden: "
      for a in open_position:
        print a
 
  def get_application_positions(filename):
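    # read the agenda order from a file: one application title per line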
    f = open(filename,'r')
    lines = [l.strip().decode('utf8') for l in f.readlines()]
    return lines
 
  if __name__ == '__main__':
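    # workflow: list the applications per category, fetch their page contents
    # (both cached in local files), parse the templates and write the CSV files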
    #download_applications(CATEGORIES)
    applications = list_applications(CATEGORIES)
    #download_content(applications)
    content = list_content(applications)
    applications = parse_content(content)
    # one title per line, in the order given by the agenda (TO)
    positions = get_application_positions("reihenfolge-to")
    write_content(applications, positions)

Current revision as of 27 August 2012, 13:05

The script can be found here: https://gist.github.com/3488254