Unterstütze uns! Spende jetzt!

Benutzer:Hub/antragsfabrik-openslides-export.py: Unterschied zwischen den Versionen

Aus PiratenWiki
Wechseln zu: Navigation, Suche
(Script zum Scrapen von Anträgen aus dem BB-Listing in OpenSlides-CSV.)
 
(update Reihenfolge aus TO auslesen)
Zeile 1: Zeile 1:
  #!/usr/bin/env python
+
#!/usr/bin/env python
 
   # -*- coding: utf-8 -*-
 
   # -*- coding: utf-8 -*-
 
   import urllib2
 
   import urllib2
Zeile 101: Zeile 101:
 
         if application["Eingereicht"] != "":
 
         if application["Eingereicht"] != "":
 
           applications_for_category.append(application)
 
           applications_for_category.append(application)
       applications_for_category.sort(key = lambda x: x["Titel"])
+
       applications_for_category.sort(key = lambda a: a["Titel"])
 
       applications[category] = applications_for_category
 
       applications[category] = applications_for_category
 
     return applications
 
     return applications
Zeile 134: Zeile 134:
  
  
   def write_content(applications):
+
   def write_content(applications, applications_position=[]):
 +
    open_position = []
 +
    open_position.extend(applications_position)
 
     for category in applications:
 
     for category in applications:
 
       f = open(category,'w+')
 
       f = open(category,'w+')
Zeile 141: Zeile 143:
 
                         "Submitter (First Name)","Submitter (Last Name)"))
 
                         "Submitter (First Name)","Submitter (Last Name)"))
 
       for a in applications[category]:
 
       for a in applications[category]:
         writer.writerow( ( "",
+
        try:
 +
          number = applications_position.index(a["Titel"]) + 1
 +
          open_position.remove(a["Titel"])
 +
        except ValueError:
 +
          print '"' + a["Titel"] + '" im Antragsbuch nicht gefunden'
 +
          number = ""
 +
         writer.writerow( ( number, # number starts at 1
 
                           a["Titel"].encode('utf8'),
 
                           a["Titel"].encode('utf8'),
 
                           filter_content(a["Antragstext"].encode('utf8')),
 
                           filter_content(a["Antragstext"].encode('utf8')),
Zeile 149: Zeile 157:
 
       f.flush()
 
       f.flush()
 
       f.close()
 
       f.close()
 +
    if open_position != []:
 +
      print "Anträge aus dem Antragsbuch, die nicht gefunden wurden: "
 +
      for a in open_position:
 +
        print a
 +
 +
  def get_application_positions(filename):
 +
    f = open(filename,'r')
 +
    lines = [l.strip().decode('utf8') for l in f.readlines()]
 +
    return lines
  
 
   if __name__ == '__main__':
 
   if __name__ == '__main__':
Zeile 156: Zeile 173:
 
     content = list_content(applications)
 
     content = list_content(applications)
 
     applications = parse_content(content)
 
     applications = parse_content(content)
     write_content(applications)
+
    #Ein Titel per Zeile, TO-Reihenfolge gegeben
 +
    #positions = get_application_positions("reihenfolge-to")
 +
     write_content(applications, positions)

Version vom 20. Juni 2012, 12:06 Uhr

 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 import urllib2
 import json
 import os 
 import csv
 # Endpoint of the MediaWiki API the applications are scraped from.
 API_URL = "http://wiki.piratenbrandenburg.de/api.php"
 # Antragsfabrik categories to export; one CSV file is written per category.
 CATEGORIES = ["Kategorie:Sonstiger_Antrag_AF_LPT_2012.1",
               "Kategorie:Satzungsänderungsantrag_AF_LPT_2012.1",
               "Kategorie:Programmantrag_AF_LPT_2012.1"]
 # Batch size for pageid lookups (presumably the API's per-request id limit
 # -- TODO confirm against the wiki's configuration).
 MAX_PAGEIDS = 50
 def get_json(endpoint):
   """Fetch *endpoint* from the wiki API and return the raw JSON body.

   endpoint -- query-string fragment, e.g. "action=query&...".
   Returns the HTTP response body as a byte string.
   """
   # The extracted source read `url = .join([...])`, which is a SyntaxError;
   # the empty-string join is the obviously intended form.
   url = "".join([
            API_URL,
            '?',
            endpoint,
            '&format=json',
            ])
   return urllib2.urlopen(url).read()
 def get_category(category, query_continue=""):
   """Return all member pages of *category*, following the API's
   query-continue markers until the listing is exhausted."""
   members = []
   cont = query_continue
   while True:
     raw = get_json("action=query&list=categorymembers&cmtitle=%s&cmcontinue=%s" % (category, cont))
     parsed = json.loads(raw)
     members += parsed["query"]["categorymembers"]
     if "query-continue" not in parsed:
       return members
     cont = parsed["query-continue"]["categorymembers"]["cmcontinue"]
 def list_applications(categories):
   """Return the application listing for *categories*.

   Uses the local 'application_list' cache file when present, otherwise
   downloads (and caches) the listing.
   """
   if not os.path.isfile("application_list"):
     return download_applications(categories)
   f = open('application_list','r')
   try:
     return json.load(f)
   finally:
     f.close()  # the original leaked the read handle
 def download_applications(categories):
   """Download the application listing and cache it in 'application_list'.

   Returns the {category: [page dicts]} mapping.
   """
   applications = _list_applications(categories)
   f = open('application_list','w+')
   try:
     json.dump(applications, f)
   finally:
     # close instead of only flushing: the original leaked the handle
     f.close()
   return applications
 def _list_applications(categories):
   """Fetch the member pages of every category.

   Returns {category: [page dicts]}.
   """
   return dict((name, get_category(name)) for name in categories)
 def get_raw_pageid(pageid):
   """Fetch the revision content for *pageid* (a single id or a
   'id|id|...' string) and return the collected revision dicts."""
   raw = get_json("action=query&prop=revisions&rvprop=content&pageids=%s" % pageid)
   pages = json.loads(raw)["query"]["pages"]
   revisions = []
   for key in pages:
     revisions.extend(pages[key]["revisions"])
   return revisions
 def chunks(l, n):
   """Yield successive slices of *l*, each of length n (the last one
   may be shorter)."""
   for start in xrange(0, len(l), n):
     piece = l[start:start + n]
     yield piece
 def get_pageid(pageids):
   """Resolve a list of numeric pageids to revision contents, batching
   the API calls so no request carries more than MAX_PAGEIDS ids."""
   collected = []
   for batch in chunks(pageids, MAX_PAGEIDS):
     joined = "|".join(str(pid) for pid in batch)
     collected += get_raw_pageid(joined)
   return collected
 def _list_content(applications):
   """Download the revision content of every application page.

   applications -- {category: [page dicts with a "pageid" key]}
   Returns {category: [revision dicts]}.
   """
   content = {}
   for category in applications:
     # Build the id list per category directly; the original's
     # membership-test/append dance also raised KeyError for a category
     # with zero pages.
     ids = [application["pageid"] for application in applications[category]]
     content[category] = get_pageid(ids)
   return content
 def download_content(applications):
   """Download all application page content and cache it in 'content'.

   Returns {category: [revision dicts]}.
   """
   content = _list_content(applications)
   f = open('content','w+')
   try:
     json.dump(content,f)
   finally:
     # close instead of only flushing: the original leaked the handle
     f.close()
   return content
 def list_content(applications):
   """Return the page content mapping.

   Uses the local 'content' cache file when present, otherwise downloads
   (and caches) the content.
   """
   if not os.path.isfile("content"):
     return download_content(applications)
   f = open('content','r')
   try:
     return json.load(f)
   finally:
     f.close()  # the original leaked the read handle
 def parse_content(content):
   """Parse the wiki templates out of the raw page content.

   Only applications whose "Eingereicht" field is non-empty are kept;
   each category's list is sorted by title.
   Returns {category: [application hashes]}.
   """
   applications = {}
   for category in content:
     parsed = [mediawiki_template(page["*"]) for page in content[category]]
     submitted = [app for app in parsed if app["Eingereicht"] != ""]
     submitted.sort(key = lambda app: app["Titel"])
     applications[category] = submitted
   return applications
 def mediawiki_template(mw_string):
   """ returns media wiki template element as a hash"""
   # Take the text between the outer {{ ... }} and split it into the
   # |-separated fields; the first entry is the template name
   # ("Antragsfabrikat") and is dropped.
   inner = mw_string.split("{{")[1].split("}}")[0]
   fields = inner.split("\n|")[1:]
   mw_hash = {}
   for field in fields:
     parts = field.split("=",1)
     if len(parts) != 2:
       raise SyntaxError("Mediawiki parsing Error %s" % parts)
     key = parts[0].strip()
     val = parts[1].strip()
     mw_hash[key] = val
   return mw_hash
 def filter_content(content):
   """ simple filter for some html tags to plain text"""
   # NOTE(review): this block was garbled by the wiki export -- the literal
   # HTML tags in the replace() calls were rendered away.  Reconstructed
   # below: superscript digits and the common <br> spellings; the "<br\>"
   # and "<br\\n>" variants survived the export verbatim.  Confirm against
   # the original script.
   content = content.replace("<sup>1</sup>","¹")
   content = content.replace("<sup>2</sup>","²")
   content = content.replace("<sup>3</sup>","³")
   content = content.replace("<br>","\n")
   content = content.replace("<br\>","\n")
   content = content.replace("<br\\n>","\n")
   content = content.replace("<br/>","\n")
   content = content.replace("<br />","\n")
   return content


 def write_content(applications, applications_position=[]):
   open_position = []
   open_position.extend(applications_position)
   for category in applications:
     f = open(category,'w+')
     writer = csv.writer(f,delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
     writer.writerow( ("Number","Title","Text","Reason",
                       "Submitter (First Name)","Submitter (Last Name)"))
     for a in applications[category]:
       try:
         number = applications_position.index(a["Titel"]) + 1
         open_position.remove(a["Titel"])
       except ValueError:
         print '"' + a["Titel"] + '" im Antragsbuch nicht gefunden'
         number = ""
       writer.writerow( ( number, # number starts at 1
                         a["Titel"].encode('utf8'),
                         filter_content(a["Antragstext"].encode('utf8')),
                         filter_content(a[u'Begr\xfcndung'].encode('utf8')),
                         a["Antragsteller"].encode('utf8'),
                         "") ) #Last Name
     f.flush()
     f.close()
   if open_position != []:
     print "Anträge aus dem Antragsbuch, die nicht gefunden wurden: "
     for a in open_position:
       print a
 def get_application_positions(filename):
   """Read one application title per line from *filename* (utf-8) and
   return them as a list of unicode strings, stripped of whitespace."""
   f = open(filename,'r')
   try:
     lines = [l.strip().decode('utf8') for l in f.readlines()]
   finally:
     f.close()  # the original never closed the handle
   return lines
 if __name__ == '__main__':
   #download_applications(CATEGORIES)
   applications = list_applications(CATEGORIES)
   #download_content(applications)
   content = list_content(applications)
   applications = parse_content(content)
   # One title per line, agenda (TO) order given.  Default to an empty list:
   # the original left `positions` undefined (NameError) while the line
   # below was commented out.
   positions = []
   #positions = get_application_positions("reihenfolge-to")
   write_content(applications, positions)