Benutzer:Hub/antragsfabrik-openslides-export.py: Difference between revisions

(update: read the application order from the agenda (TO))
(move to GitHub)
 
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import urllib2
import json
import os
import csv

API_URL = "http://wiki.piratenbrandenburg.de/api.php"
CATEGORIES = ["Kategorie:Sonstiger_Antrag_AF_LPT_2012.1",
              "Kategorie:Satzungsänderungsantrag_AF_LPT_2012.1",
              "Kategorie:Programmantrag_AF_LPT_2012.1"]
MAX_PAGEIDS = 50

def get_json(endpoint):
  url = ''.join([
          API_URL,
          '?',
          endpoint,
          '&format=json',
          ])
  return urllib2.urlopen(url).read()
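# Illustrative example (added, not in the original): for a category listing,
# get_json() builds a request URL such as
#   http://wiki.piratenbrandenburg.de/api.php?action=query&list=categorymembers&cmtitle=Kategorie:Programmantrag_AF_LPT_2012.1&cmcontinue=&format=json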
 
 
 
def get_category(category, query_continue=""):
  data = get_json("action=query&list=categorymembers&cmtitle=%s&cmcontinue=%s" % (category, query_continue))
  json_data = json.loads(data)
  pages = json_data["query"]["categorymembers"]
  if "query-continue" in json_data:
    pages += get_category(category, json_data["query-continue"]["categorymembers"]["cmcontinue"])
  return pages
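# Note (added): when not all category members fit into a single response, the
# MediaWiki API appends a "query-continue" block whose "cmcontinue" value is
# passed to the recursive call above. Response shape (values illustrative):
#   {"query": {"categorymembers": [{"pageid": 123, "ns": 0, "title": "..."}]},
#    "query-continue": {"categorymembers": {"cmcontinue": "..."}}}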
 
 
 
def list_applications(categories):
  # Use the cached "application_list" file if present, otherwise download.
  if os.path.isfile("application_list"):
    f = open('application_list', 'r')
    return json.load(f)
  return download_applications(categories)

def download_applications(categories):
  # Fetch the category members from the wiki and cache them on disk.
  applications = _list_applications(categories)
  f = open('application_list', 'w+')
  json.dump(applications, f)
  f.flush()
  return applications

def _list_applications(categories):
  applications = {}
  for category in categories:
    pages = get_category(category)
    applications[category] = pages
  return applications
 
 
 
def get_raw_pageid(pageid):
  data = get_json("action=query&prop=revisions&rvprop=content&pageids=%s" % pageid)
  json_data = json.loads(data)
  pages = json_data["query"]["pages"]
  content = []
  for pageids in pages:
    content += pages[pageids]["revisions"]
  return content
 
 
 
def chunks(l, n):
  for i in xrange(0, len(l), n):
    yield l[i:i+n]

def get_pageid(pageids):
  pages = []
  for chunk in chunks(pageids, MAX_PAGEIDS):
    pages += get_raw_pageid("|".join(str(i) for i in chunk))
  return pages
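# Note (added): the chunking above exists because the MediaWiki API accepts
# only a limited number of page ids per request (50 for normal users, which
# is what MAX_PAGEIDS mirrors); get_raw_pageid() joins each chunk with "|".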
 
 
 
def _list_content(applications):
  pageids = {}
  content = {}
  for category in applications.iterkeys():
    # Collect all page ids of a category, then fetch their contents.
    pageids[category] = []
    for application in applications[category]:
      pageids[category].append(application["pageid"])
    content[category] = get_pageid(pageids[category])
  return content
 
 
 
def download_content(applications):
  content = _list_content(applications)
  f = open('content', 'w+')
  json.dump(content, f)
  f.flush()
  return content

def list_content(applications):
  # Use the cached "content" file if present, otherwise download.
  if os.path.isfile("content"):
    f = open('content', 'r')
    return json.load(f)
  return download_content(applications)
 
 
 
def parse_content(content):
  applications = {}
  for category in content.iterkeys():
    applications_for_category = []
    for application_content in content[category]:
      application = mediawiki_template(application_content["*"])
      # Keep only applications that were actually submitted ("Eingereicht").
      if application["Eingereicht"] != "":
        applications_for_category.append(application)
    applications_for_category.sort(key=lambda a: a["Titel"])
    applications[category] = applications_for_category
  return applications
 
 
 
def mediawiki_template(mw_string):
  """Returns a MediaWiki template element as a hash."""
  # Split the content inside the template braces into "|key = value" parts.
  strings = mw_string.split("{{")[1].split("}}")[0].split("\n|")
  # Drop the template name ("Antragsfabrikat").
  strings = strings[1:]
  mw_hash = {}
  for string in strings:
    keyval = string.split("=", 1)
    if 2 != len(keyval):
      raise SyntaxError("Mediawiki parsing Error %s" % keyval)
    keyval = [s.strip() for s in keyval]
    key, val = keyval
    mw_hash[key] = val
  return mw_hash
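# Illustrative example (added): mediawiki_template() expects each application
# page to carry a template of roughly this shape; the field names are the
# keys used elsewhere in this script, the values here are invented:
#   {{Antragsfabrikat
#   |Titel = Beispielantrag
#   |Antragsteller = Max Mustermann
#   |Eingereicht = Ja
#   |Antragstext = Der Parteitag möge beschließen ...
#   |Begründung = ...
#   }}
# A non-empty "Eingereicht" value marks the application as submitted.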
 
 
 
def filter_content(content):
  """Simple filter turning some HTML tags into plain text."""
  content = content.replace("<sup>1</sup>", "¹")
  content = content.replace("<sup>2</sup>", "²")
  content = content.replace("<sup>3</sup>", "³")
  # Normalize the various <br> spellings found in the wiki text.
  content = content.replace("<br>", "\n")
  content = content.replace("<br\\>", "\n")
  content = content.replace("<br\\n>", "\n")
  content = content.replace("<br />", "\n")
  content = content.replace("<br/>", "\n")
  return content
 
 
 
 
 
def write_content(applications, applications_position=[]):
  open_position = []
  open_position.extend(applications_position)
  for category in applications:
    f = open(category, 'w+')
    writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
    writer.writerow(("Number", "Title", "Text", "Reason",
                     "Submitter (First Name)", "Submitter (Last Name)"))
    for a in applications[category]:
      try:
        number = applications_position.index(a["Titel"]) + 1  # numbering starts at 1
        open_position.remove(a["Titel"])
      except ValueError:
        print '"' + a["Titel"] + '" im Antragsbuch nicht gefunden'
        number = ""
      writer.writerow((number,
                       a["Titel"].encode('utf8'),
                       filter_content(a["Antragstext"].encode('utf8')),
                       filter_content(a[u'Begr\xfcndung'].encode('utf8')),
                       a["Antragsteller"].encode('utf8'),
                       ""))  # last name is left empty
    f.flush()
    f.close()
  if open_position != []:
    print "Anträge aus dem Antragsbuch, die nicht gefunden wurden: "
    for a in open_position:
      print a
 
 
 
def get_application_positions(filename):
  f = open(filename, 'r')
  lines = [l.strip().decode('utf8') for l in f.readlines()]
  return lines
 
 
 
if __name__ == '__main__':
  # download_applications(CATEGORIES) and download_content(applications)
  # can be called here to force a refresh of the on-disk caches.
  applications = list_applications(CATEGORIES)
  content = list_content(applications)
  applications = parse_content(content)
  # One title per line, in the order given by the agenda (TO).
  positions = get_application_positions("reihenfolge-to")
  write_content(applications, positions)
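# Usage sketch (added, under assumptions): the input file "reihenfolge-to" is
# expected to contain one application title per line, UTF-8 encoded, in the
# order of the agenda (Tagesordnung), e.g. with invented titles:
#   Beispielantrag A
#   Beispielantrag B
# Running the script (e.g. `python antragsfabrik-openslides-export.py`) then
# writes one CSV file per category, named after the category, intended for
# import into OpenSlides.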
 

Current revision as of 27 August 2012, 15:05

The script can be found here: https://gist.github.com/3488254