
Benutzer:Hub/antragsfabrik-openslides-export.py

Script for scraping motions (Anträge) from the BB wiki listing into OpenSlides CSV files.
 
  #!/usr/bin/env python
  # -*- coding: utf-8 -*-
  import urllib
  import urllib2
  import json
  import os
  import csv
 
  API_URL = "http://wiki.piratenbrandenburg.de/api.php"
  CATEGORIES = ["Kategorie:Sonstiger_Antrag_AF_LPT_2012.1",
                "Kategorie:Satzungsänderungsantrag_AF_LPT_2012.1",
                "Kategorie:Programmantrag_AF_LPT_2012.1"]
  MAX_PAGEIDS = 50  # the MediaWiki API accepts at most 50 pageids per request for normal users
 
  def get_json(endpoint):
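    # build the full API URL and return the raw JSON response body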
    url = ''.join([
            API_URL,
            '?',
            endpoint,
            '&format=json',
            ])
    return urllib2.urlopen(url).read()
 
  def get_category(category, query_continue=""):
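    # The API pages its results: follow the query-continue token recursively
    # until all members of the category have been fetched. Parameters are
    # percent-encoded because the category titles contain umlauts.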
    data = get_json("action=query&list=categorymembers&cmtitle=%s&cmcontinue=%s"
                    % (urllib.quote(category), urllib.quote(query_continue)))
    json_data = json.loads(data)
    pages = json_data["query"]["categorymembers"]
    if "query-continue" in json_data:
      pages += get_category(category, json_data["query-continue"]["categorymembers"]["cmcontinue"])
    return pages
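  # For illustration: get_category("Kategorie:Programmantrag_AF_LPT_2012.1")
  # yields member dicts of the form {"pageid": ..., "ns": ..., "title": ...}.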
 
  def list_applications(categories):
    # use the on-disk cache when present, otherwise download
    if os.path.isfile("application_list"):
      with open('application_list', 'r') as f:
        return json.load(f)
    return download_applications(categories)
 
  def download_applications(categories):
    # fetch the category listings and cache them on disk
    applications = _list_applications(categories)
    with open('application_list', 'w+') as f:
      json.dump(applications, f)
    return applications
 
  def _list_applications(categories):
    applications = {}
    for category in categories:
      pages = get_category(category)
      applications[category] = pages
    return applications
 
  def get_raw_pageid(pageid):
    # fetch the latest revision content for the given pageid(s);
    # several ids may be passed joined with "|"
    data = get_json("action=query&prop=revisions&rvprop=content&pageids=%s" % pageid)
    json_data = json.loads(data)
    pages = json_data["query"]["pages"]
    content = []
    for page in pages:
      content += pages[page]["revisions"]
    return content
 
  def chunks(l, n):
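    # yield successive n-sized slices of l,
    # e.g. list(chunks(range(5), 2)) == [[0, 1], [2, 3], [4]]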
    for i in xrange(0, len(l), n):
      yield l[i:i+n]
 
  def get_pageid(pageids):
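    # resolve page contents in chunks of MAX_PAGEIDS ids per API request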
    pages = []
    for chunk in chunks(pageids, MAX_PAGEIDS):
      pages += get_raw_pageid("|".join(str(i) for i in chunk))
    return pages
 
  def _list_content(applications):
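    # collect the pageids per category, then fetch their revision contents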
    pageids = {}
    content = {}
    for category in applications.iterkeys():
      for application in applications[category]:
        if category in pageids:
          pageids[category] += [application["pageid"]]
        else:
          pageids[category] = [application["pageid"]]
      content[category] = get_pageid(pageids[category])
    return content
 
  def download_content(applications):
    # fetch the page contents and cache them on disk
    content = _list_content(applications)
    with open('content', 'w+') as f:
      json.dump(content, f)
    return content
 
  def list_content(applications):
    # use the on-disk cache when present, otherwise download
    if os.path.isfile("content"):
      with open('content', 'r') as f:
        return json.load(f)
    return download_content(applications)
 
  def parse_content(content):
    applications = {}
    for category in content.iterkeys():
      applications_for_category = []
      for application_content in content[category]:
        application = mediawiki_template(application_content["*"])
        # keep only motions that were actually submitted
        if application["Eingereicht"] != "":
          applications_for_category.append(application)
      applications_for_category.sort(key=lambda x: x["Titel"])
      applications[category] = applications_for_category
    return applications
 
  def mediawiki_template(mw_string):
    """Return the fields of the first MediaWiki template on the page as a dict."""
    # take the content between "{{" and "}}" and split it into |key=value lines
    strings = mw_string.split("{{")[1].split("}}")[0].split("\n|")
    # drop the template name ("Antragsfabrikat")
    strings = strings[1:]
    mw_hash = {}
    for string in strings:
      keyval = string.split("=", 1)
      if len(keyval) != 2:
        raise SyntaxError("MediaWiki parsing error: %s" % keyval)
      keyval = [s.strip() for s in keyval]
      key, val = keyval
      mw_hash[key] = val
    return mw_hash
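  # For illustration (assumed page shape): a page body such as
  #   {{Antragsfabrikat
  #   |Titel=Beispielantrag
  #   |Antragstext=...
  #   |Eingereicht=Ja
  #   }}
  # parses to {"Titel": "Beispielantrag", "Antragstext": "...", "Eingereicht": "Ja"}.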
 
  def filter_content(content):
    """Naively convert a few HTML tags to plain text."""
    # superscript digits used for footnote markers
    content = content.replace("<sup>1</sup>", "¹")
    content = content.replace("<sup>2</sup>", "²")
    content = content.replace("<sup>3</sup>", "³")
    # line breaks, including malformed variants found in the wiki text
    content = content.replace("<br>", "\n")
    content = content.replace("<br\\>", "\n")
    content = content.replace("<br\\n>", "\n")
    content = content.replace("<br />", "\n")
    content = content.replace("<br/>", "\n")
    return content
 
 
  def write_content(applications):
    # write one CSV file per category in the column layout of the OpenSlides import
    for category in applications:
      f = open(category, 'w+')
      writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
      writer.writerow(("Number", "Title", "Text", "Reason",
                       "Submitter (First Name)", "Submitter (Last Name)"))
      for a in applications[category]:
        writer.writerow(("",  # number left empty
                         a["Titel"].encode('utf8'),
                         filter_content(a["Antragstext"].encode('utf8')),
                         filter_content(a[u'Begr\xfcndung'].encode('utf8')),
                         a["Antragsteller"].encode('utf8'),
                         ""))  # last name left empty
      f.flush()
      f.close()
 
  if __name__ == '__main__':
    # uncomment the download_* calls to refresh the caches unconditionally
    #download_applications(CATEGORIES)
    applications = list_applications(CATEGORIES)
    #download_content(applications)
    content = list_content(applications)
    applications = parse_content(content)
    write_content(applications)
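
Run without arguments, the script caches the raw API responses in the files application_list and content (delete both to force a fresh download) and writes one CSV file per category, ready for import into OpenSlides.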

Latest revision as of 27 August 2012, 13:05

The script can be found here: https://gist.github.com/3488254