Calculate sex and category positions.

2018-08-14 11:05:57 +02:00 · 2018-08-14 11:05:57 +02:00 · 6e522170fb
commit 6e522170fb
parent ddd3f64fa3
4 changed files with 134 additions and 55 deletions
--- a/aacstats.py
+++ b/aacstats.py
@ -20,6 +20,14 @@ def urlescape(string):
 def pace(time):
    return (dt.datetime(1,1,1) + time).strftime('%M:%S')

+@app.template_filter('gender')
+def gender(sex):
+    ''' Template filter: turn a stored sex value into the word used in result notes.
+
+    Strings whose first non-blank character is "f" or "m" (any case) become
+    'lady' / 'man'. That covers the single-letter codes as well as longer
+    spellings such as 'female' / 'male'. Any other value is handed back
+    unchanged instead of None, so later filters in the template chain always
+    receive the original value rather than None.
+    '''
+    if not isinstance(sex, str):
+        return sex
+    labels = {'f': 'lady', 'm': 'man'}
+    return labels.get(sex.strip()[:1].lower(), sex)
+
@app.template_filter('year')
 def year(time):
    return time.strftime('%Y')
@ -69,7 +77,7 @@ def getshow():
    return show


-def read_db(listing=None, event=None, person=None, licence=None, search=dict(), year=None, finishers=False):
+def read_db(listing=None, event=None, person=None, licence=None, search=dict(), year=None):
    db = MySQLdb.connect(user='aac', passwd='saOAcCWHg4LaoSSA', db='AAC',
                         use_unicode=True, charset="utf8", cursorclass=MySQLdb.cursors.DictCursor)
    c =  db.cursor()
@ -140,7 +148,7 @@ def read_db(listing=None, event=None, person=None, licence=None, search=dict(),
        where += ' AND CONCAT_WS(" ", name, surname) NOT LIKE "%blank%card%"'
        where += ' AND CONCAT_WS(" ", name, surname) NOT LIKE "%disqualified%"'
        if listing == 'races':
-            select = 'TRIM(event), date'
+            select = 'TRIM(event) AS event, date'
            group  = 'GROUP BY event, date'
            order  = 'date DESC, TRIM(event)'
        elif listing == 'runners':
@ -167,9 +175,9 @@ def read_db(listing=None, event=None, person=None, licence=None, search=dict(),
        lastdate  = lastdate.replace(year=int(year))
        where += ' AND date > "{}" AND date < "{}"'.format(firstdate, lastdate)
    ''' This statement is expensive but doesn't increase the count, so don't change the count statement '''
-    if finishers: 
-        select = 'total.finishers, query.* FROM( SELECT *'
-        close =   ') AS query INNER JOIN (SELECT event, date, distance, COUNT(event) as finishers FROM `results` GROUP BY event, distance, date) AS total ON total.event=query.event AND total.date=query.date AND total.distance=query.distance'
+    #if finishers: 
+    #    select = 'total.finishers, query.* FROM( SELECT *'
+    #    close =   ') AS query INNER JOIN (SELECT event, date, distance, COUNT(event) as finishers FROM `results` GROUP BY event, distance, date) AS total ON total.event=query.event AND total.date=query.date AND total.distance=query.distance'


    sql = 'SELECT {} FROM `results` {} {} ORDER BY {} {} {};'.format(select, where, group, order, limit, close)
@ -257,7 +265,7 @@ def races(year=None, title=None):
 def person(title=None, year=None):
    if title is not None:
        title = urllib.parse.unquote_plus(title)
-    results = read_db(person=title, year=year, finishers=True)
+    results = read_db(person=title, year=year)
    return render_template('index.html', ltype='person', title=title,
                            results=results, year=year, 
                            request=request, getstart=getstart(), getshow=getshow(), now=now(), PAGE_SIZE=PAGE_SIZE)
@ -266,7 +274,7 @@ def person(title=None, year=None):
 def licence(year=now().year, title=None):
    if title is not None:
        title = urllib.parse.unquote_plus(title)
-    results = read_db(licence=title, year=year, finishers=True)
+    results = read_db(licence=title, year=year)
    return render_template('index.html', ltype='licence', title=title,
                            results=results, year=year, 
                            request=request, getstart=getstart(), getshow=getshow(), now=now(), PAGE_SIZE=PAGE_SIZE)
--- a/load_spreadsheet.py
+++ b/load_spreadsheet.py
@ -15,6 +15,7 @@ import mimetypes
 import csv
 import xlrd
 import MySQLdb
+import MySQLdb.cursors
 import argparse
 import datetime as dt
 import dateutil.parser as dtp
@ -45,12 +46,16 @@ def main():
  args = parse_arguments()
  rows = []

-  if args.scrape_web:
+  if args.calculate:
+    #position_calculations('Winelands Marathon')
+    position_calculations()
+    return
+  elif args.scrape_web:
    spreadsheets = []
    uniqurl = []
    wpa = 'http://www.wpa.org.za/Events/DynamicEvents.asmx/BuildEventDisplay'
    for year in range(2016, dt.datetime.now().year + 1):
-      log.debug("Finding results for %s" % year);
+      log.debug("Finding results for {}".format(year));
      args = {"WPAExtra":"True","TimeColumn":"True","entityid":"674417","selectedyear":year,"selectedmonth":0,"commissionid":"0","selectedstate":"0","categoryid":0,"themeid":"46"}
      data = bytes(json.dumps(args).encode('utf8'))
      req = urllib.request.Request(wpa, data=data, headers={'content-type': 'application/json'})
@ -79,7 +84,7 @@ def main():
    for race in spreadsheets:
      url = race['url']
      with urllib.request.urlopen(url) as response, tempfile.TemporaryDirectory() as tmpdir:
-        log.info("Loading data from URL %s" % race['url'])
+        log.info("Loading data from URL {}".format(race['url']))
        data = response.read()
        urlparts = urllib.parse.urlparse(url)
        filename = os.path.basename(urlparts.path)
@ -89,7 +94,7 @@ def main():
          try:
            rows = read_spreadsheet(filepath, src=url, eventname=race['event'], eventdate=race['date'], eventdistance=race['distance'])
          except:
-            log.warning("ERROR: Unable to load data from URL %s" % url)
+            log.warning("ERROR: Unable to load data from URL {}".format(url))
            raise
          else:
            load_into_db(rows)
@ -99,7 +104,7 @@ def main():
      
  elif args.input_file:
    rows = read_spreadsheet(args.input_file, src=args.input_file)
-    log.info("Loading data from file %s" % args.input_file)
+    log.info("Loading data from file {}".format(args.input_file))
    load_into_db(rows)

  else:
@ -113,21 +118,74 @@ def main():
        if not filename:
          if not ext:
            ext = '.xls' # attempt to decode as a spreadsheet
-          filename = 'part-%03d%s' % (counter, ext)
+          filename = 'part-{:03}{}'.format(counter, ext)
        counter += 1
        if re.search('.xl(b|s)x?$', filename, flags=re.IGNORECASE) is not None:
          with tempfile.TemporaryDirectory() as tmpdir:
            filepath = os.path.join(tmpdir, filename)
            with open(filepath, 'wb') as fp:
              fp.write(part.get_payload(decode=True))
-              log.info("Loading data from file %s" % filename)
+              log.info("Loading data from file {}".format(filename))
              try:
                rows = read_spreadsheet(filepath, src=message['from'])
-                load_into_db(rows)
              except:
-                log.info("Unable to load data from file %s" % filename)
+                log.info("Unable to load data from file {}".format(filename))
                pass
+              else:
+                load_into_db(rows)

+  position_calculations()
+  return
+
+def position_calculations(event=None):
+  ''' Fill in finisher totals and per-sex / per-category positions in `results`.
+
+  For every distinct (event, date, distance) combination this sets, on rows
+  where the column is still NULL:
+    finishers     total rows for the race
+    sexfinishers  total rows for the race with the same sex
+    sexposition   rank within the sex, ordered by overall position
+    catfinishers  total rows for the race with the same category
+    catposition   rank within the category, ordered by overall position
+
+  event: optional substring of the event name; when given, only races whose
+         name contains it are recalculated. None (the default) means all races.
+
+  All updates are committed together at the end; if any statement raises, the
+  connection is closed without committing and the exception propagates.
+
+  NOTE(review): the table definition quoted in load_into_db declares
+  sexposition and catposition as NOT NULL; if the live table matches, the
+  "IS NULL" filters on those two columns can never match - confirm.
+  NOTE(review): the ranking queries rely on user variables being assigned in
+  ORDER BY order inside a derived table - verify against the MySQL version in use.
+  '''
+  db = MySQLdb.connect(user='aac', passwd='saOAcCWHg4LaoSSA', db='AAC',
+                       use_unicode=True, charset="utf8", cursorclass=MySQLdb.cursors.DictCursor)
+  try:
+    c = db.cursor()
+
+    ''' Find the races to process; values are passed as parameters, never pasted into the SQL text '''
+    sql = 'SELECT event, date, distance FROM `results` {} GROUP BY event, date, distance'
+    if event:
+      c.execute(sql.format('WHERE event LIKE %s'), ('%{}%'.format(event),))
+    else:
+      c.execute(sql.format(''))
+    #log.debug(c._last_executed)
+    eventlist = list(c.fetchall())
+
+    for race in eventlist:
+      log.debug(race)
+      log.debug("Recalculating postion information for {}".format(race['event']))
+
+      ''' Stringified so the LIKE / = comparisons see the same text the values format to '''
+      params = tuple(str(race[key]) for key in ('event', 'date', 'distance'))
+
+      ''' Calculate total finishers per race '''
+      c.execute('UPDATE `results` AS r, (SELECT event, date, distance, COUNT(distance) AS finishers FROM `results` WHERE event LIKE %s AND date = %s AND distance LIKE %s GROUP BY event, date, distance) AS f SET r.finishers = f.finishers WHERE r.event = f.event AND r.date = f.date AND r.distance = f.distance AND r.finishers IS NULL;', params)
+
+      ''' Update total finishers per sex per race (grouped by every selected column so races are never merged) '''
+      c.execute('UPDATE `results` AS r, (SELECT event, date, distance, sex, COUNT(sex) as sexfinishers FROM `results` WHERE event LIKE %s AND date = %s AND distance LIKE %s GROUP BY event, date, distance, sex) AS s SET r.sexfinishers = s.sexfinishers WHERE r.event = s.event AND r.date = s.date AND r.distance = s.distance AND r.sexfinishers IS NULL AND r.sex = s.sex;', params)
+
+      ''' Update individual positions per sex per race '''
+      c.execute('SET @rank = 0;')
+      c.execute('UPDATE `results` AS r, (SELECT result_key, position, @rank := IF(@sex = sex, @rank+1, 1) AS srank, @sex := sex FROM `results` WHERE event LIKE %s AND date = %s AND distance LIKE %s AND sex IS NOT NULL ORDER BY sex, position) AS s SET r.sexposition = s.srank WHERE r.result_key = s.result_key AND r.sexposition IS NULL;', params)
+
+      ''' Update total finishers per category per race (grouped by every selected column so races are never merged) '''
+      c.execute('UPDATE `results` AS r, (SELECT event, date, distance, category, COUNT(category) as catfinishers FROM `results` WHERE event LIKE %s AND date = %s AND distance LIKE %s GROUP BY event, date, distance, category) AS c SET r.catfinishers = c.catfinishers WHERE r.event = c.event AND r.date = c.date AND r.distance = c.distance AND r.catfinishers IS NULL AND r.category = c.category;', params)
+
+      ''' Update individual positions per category per race '''
+      c.execute('SET @rank = 0;')
+      c.execute('UPDATE `results` AS r, (SELECT result_key, position, @rank := IF(@cat = category, @rank+1, 1) AS crank, @cat := category FROM `results` WHERE event LIKE %s AND date = %s AND distance LIKE %s AND category IS NOT NULL ORDER BY category, position) AS c SET r.catposition = c.crank WHERE r.result_key = c.result_key AND r.catposition IS NULL;', params)
+
+    db.commit()
+  finally:
+    ''' Always release the connection, whether or not the updates were committed '''
+    db.close()
+  return
  

@ -136,7 +194,7 @@ def load_into_db(rows):
    CREATE TABLE `results` (
      `result_key` int(11) NOT NULL AUTO_INCREMENT,
      `date` datetime DEFAULT NULL,
-      `distance` float(10) DEFAULT NULL,
+      `distance` decimal(4,2) DEFAULT NULL,
      `event` varchar(100) COLLATE utf8_unicode_ci NOT NULL,
      `position` int(5) NOT NULL,
      `finishers` int(5) DEFAULT NULL,
@ -147,15 +205,17 @@ def load_into_db(rows):
      `club` varchar(40) COLLATE utf8_unicode_ci DEFAULT NULL,
      `age` int(3) DEFAULT NULL,
      `sex` varchar(10) COLLATE utf8_unicode_ci DEFAULT NULL,
+      `sexposition` int(5) NOT NULL,
+      `sexfinishers` int(5) DEFAULT NULL,
      `category` varchar(15) COLLATE utf8_unicode_ci DEFAULT NULL,
-      `sexposition` int(5) DEFAULT NULL,
-      `catposition` int(5) DEFAULT NULL,
+      `catposition` int(5) NOT NULL,
+      `catfinishers` int(5) DEFAULT NULL,
      `source` varchar(200) COLLATE utf8_unicode_ci DEFAULT NULL,
      PRIMARY KEY (`result_key`)
    ) ENGINE=InnoDB CHARSET=utf8 COLLATE=utf8_unicode_ci;
  '''
  if rows is None or len(rows) < 1:
-    log.warning("No data found in spreadsheet")
+    log.warning("**** No data found in spreadsheet ****")
  else:
    db = MySQLdb.connect(user='aac', passwd='saOAcCWHg4LaoSSA', db='AAC', 
                         use_unicode=True, charset="utf8")
@ -179,13 +239,13 @@ def load_into_db(rows):
    for r in rows:
      fields = ', '.join(r.keys())
      values = ', '.join(['%s'] * len(r)) # placeholder values
-      sql = 'INSERT into `results` ( %s ) VALUES ( %s )' % (fields, values)
+      sql = 'INSERT into `results` ( {} ) VALUES ( {} )'.format(fields, values)
      try:
        c.execute(sql, r.values())
      except :
        e = sys.exc_info()[0]
-        log.debug("ERROR: %s" % e)
-        log.debug("Last query was: %s" % c._last_executed)
+        log.debug("ERROR: {}".format(e))
+        log.debug("Last query was: {}".format(c._last_executed))
        raise
        #pass

@ -200,7 +260,7 @@ def read_spreadsheet(spreadsheet, src=None, eventname=None, eventdate=None, even
    book = xlrd.open_workbook(spreadsheet)
    for sheetname in book.sheet_names():
      sheet = book.sheet_by_name(sheetname)
-      log.debug("Processing sheet %s" % sheetname)
+      log.debug("Processing sheet {}".format(sheetname))

      ''' Look for the header in the first 15 rows, searching from the top '''
      fields = []
@ -208,7 +268,7 @@ def read_spreadsheet(spreadsheet, src=None, eventname=None, eventdate=None, even
        try:
          if re.search('((pos\w*|no\w*|num\w*|surname|name|time|club)\s*){2,}', ' '.join(str(x) for x in (sheet.row_values(row))), flags=re.IGNORECASE) is not None:
            fields = sheet.row_values(row)
-            log.debug("Spreadsheet fields: %s" % ', '.join(str(x) for x in fields))
+            log.debug("Spreadsheet fields: {}".format(', '.join(str(x) for x in fields)))
            break
        except:
          ''' Probably a blank sheet, let's skip '''
@ -237,7 +297,7 @@ def read_spreadsheet(spreadsheet, src=None, eventname=None, eventdate=None, even
          fields[i] = 'club'
        elif re.search('^\s*age(\.|\w*)\s*$', str(fields[i]), flags=re.IGNORECASE) is not None:
          fields[i] = 'age'
-        elif re.search('^\s*(sex|gender|m.?f\b|male|female)(\.|\w*)\s*$', str(fields[i]), flags=re.IGNORECASE) is not None:
+        elif re.search('^\s*(sex|gender|m.?f|male|female)(\.|\w*)\s*$', str(fields[i]), flags=re.IGNORECASE) is not None:
          fields[i] = 'sex'
        elif re.search('^\s*cat(\.|\w*)\s*$', str(fields[i]), flags=re.IGNORECASE) is not None:
          fields[i] = 'category'
@ -257,14 +317,32 @@ def read_spreadsheet(spreadsheet, src=None, eventname=None, eventdate=None, even

      ''' Look for the date in the file name, and then look the first 15 rows and override it '''
      if eventdate is None:
+        eventdate = dt.datetime.min
        filedate = re.search('(20\d{2})', str(filename), flags=re.IGNORECASE)
        if filedate is not None:
          eventdate = filedate.group(1)
        for row in range(0, 15):
-          if re.search('(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*\s+\d{4}', str(sheet.cell(row, 0).value), flags=re.IGNORECASE) is not None:
-            eventdate = sheet.cell(row,0).value
+          sheetdate = re.search('(\d+\s*(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*\s+\d{4})', str(sheet.cell(row, 0).value), flags=re.IGNORECASE)
+          if sheetdate is not None:
+            eventdate = sheetdate.group(1)
            break
-      log.info("Race date: %s" % eventdate.strftime('%Y-%m-%d'))
+      log.info("Race date: {}".format(eventdate))
+      #log.info("Race date: {}".format(eventdate.strftime('%Y-%m-%d')))
+
+      if eventname is None:
+        ''' Use the filename as the event name :-( '''
+        eventname, *_ = os.path.splitext(filename)
+        eventname = str(eventname)
+        ''' Clean up common patterns '''
+        eventname = re.sub('[\-_]', ' ', eventname, flags=re.IGNORECASE)
+        eventname = re.sub('results?(\s*book)?', ' ', eventname, flags=re.IGNORECASE)
+        eventname = re.sub('export', ' ', eventname, flags=re.IGNORECASE)
+        eventname = re.sub('excel', ' ', eventname, flags=re.IGNORECASE)
+        eventname = re.sub('\(\d\)', ' ', eventname, flags=re.IGNORECASE)
+        eventname = re.sub('\d{0,4}20\d{2}\d{0,4}', ' ', eventname, flags=re.IGNORECASE)
+        eventname = re.sub('\s\s+', ' ', eventname, flags=re.IGNORECASE)
+      eventname = re.sub('(^\s*|\s*$)', '', eventname, flags=re.IGNORECASE)
+      log.info("Event name: {}".format(eventname))

      ''' Look for the race distance in the sheet name, or in the filename '''
      distance = eventdistance
@ -280,22 +358,7 @@ def read_spreadsheet(spreadsheet, src=None, eventname=None, eventdate=None, even
      sheetdistance = re.search('(helper|marshal)', sheetname, flags=re.IGNORECASE)
      if sheetdistance is not None:
        distance = sheetname
-      log.info("Race distance: %s" % distance)
-
-      if eventname is None:
-        ''' Use the filename as the event name :-( '''
-        eventname, *_ = os.path.splitext(filename)
-        eventname = str(eventname)
-        ''' Clean up common patterns '''
-        eventname = re.sub('[\-_]', ' ', eventname, flags=re.IGNORECASE)
-        eventname = re.sub('results?(\s*book)?', ' ', eventname, flags=re.IGNORECASE)
-        eventname = re.sub('export', ' ', eventname, flags=re.IGNORECASE)
-        eventname = re.sub('excel', ' ', eventname, flags=re.IGNORECASE)
-        eventname = re.sub('\(\d\)', ' ', eventname, flags=re.IGNORECASE)
-        eventname = re.sub('\d{0,4}20\d{2}\d{0,4}', ' ', eventname, flags=re.IGNORECASE)
-        eventname = re.sub('\s\s+', ' ', eventname, flags=re.IGNORECASE)
-      eventname = re.sub('(^\s*|\s*$)', '', eventname, flags=re.IGNORECASE)
-      log.info("Event name: %s" % eventname)
+      log.info("Race distance: {}".format(distance))

      for row in range(sheet.nrows):
        ''' TODO: don't assume that the position is the first cell '''
@ -320,7 +383,7 @@ def read_spreadsheet(spreadsheet, src=None, eventname=None, eventdate=None, even
   
    rows = clean_data(rows)
    if len(rows) > 0:
-      log.debug("Sample output: %s" % pp.pformat(rows[0]))
+      log.debug("Sample output: {}".format(pp.pformat(rows[0])))
    return rows


@ -355,12 +418,17 @@ def clean_data(input_rows):
    length = re.search('([\d\.]+)\s*km', str(ir.get('distance')), flags=re.IGNORECASE)
    if length is not None:
      r['distance'] = length.group(1)
+    else:
+      r['distance'] = 0

    ''' Fix sex '''
-    if 'sex' in ir and re.search('^\s*F', str(ir.get('sex')), flags=re.IGNORECASE) is not None:
-      r['sex'] = 'female'
+    if 'sex' in ir:
+      sex = 'sex' in ir and re.search('^\s*(F|M)', str(ir.get('sex')), flags=re.IGNORECASE)
+      if sex is not None:
+        if sex.group(1) == 'F':
+          r['sex'] = 'F'
        else:
-      r['sex'] = 'male'
+          r['sex'] = 'M'

    ''' Fix club '''
    if re.search('^\s*(AAC\b|Atlantic\s*Athletic)', str(ir.get('club')), flags=re.IGNORECASE) is not None:
@ -387,7 +455,7 @@ def clean_data(input_rows):
        r[key] = 0

    ''' Should be a string '''
-    for key in ( 'event', 'name', 'surname', 'licence', 'club', 'category', 'sex', ):
+    for key in ( 'event', 'name', 'surname', 'licence', 'club', 'category', ):
      val = ir.get(key)
      if isinstance(val, float):
        val = int(val)
@ -410,6 +478,9 @@ def parse_arguments():
  parser.add_argument(
    '--web', '-w', action='store_true', required=False, dest='scrape_web',
    help='Scrape WPA website')
+  parser.add_argument(
+    '--calc', '-c', action='store_true', required=False, dest='calculate',
+    help='Calculate unset positions in the database')
  parser.add_argument(
    '--input', '-i', action='store', required=False, type=str, dest='input_file',
    help='Manually select the spreadsheet to be imported')
@ -422,7 +493,7 @@ def parse_arguments():
    if not os.path.exists(args.input_file) or not os.access(args.input_file, os.R_OK):
      raise 

-  logging.basicConfig()
+  logging.basicConfig(format='%(message)s')
  if args.verbose is not None and args.verbose == 1:
    log.setLevel(logging.INFO)
  elif args.verbose is not None and args.verbose >= 2:
--- a/templates/index.html
+++ b/templates/index.html
@ -37,13 +37,13 @@
         <td class="nowrap"><span class="label">Name</span> <span><a href="{{ url_for('person', title=person|trim|urlescape, start=None) }}">{{ person|trim|e }}</a></span></td>
         <td class="nowrap"><span class="label">Licence</span> <span>{% if row.licence %}<a href="{{ url_for('licence', title=row.licence|trim|urlescape, year=row.date|year, start=None, show=ns.show) }}">{{ row.licence|trim|e }}</a>{% endif %}</span></td>
         <td><span class="label">Time</span> <span>{{ row.time|e }}</span></td>
-         <td class="nowrap"><span class="label">Average Pace</span> <span>{% if row.distance is number and row.distance|int != 0 %}{{ (row.time / row.distance) | pace }} min/KM{% endif %}</span></td>
+         <td class="nowrap"><span class="label">Average Pace</span> <span>{% if row.distance is number and row.distance|float != 0 %}{{ (row.time / row.distance|float) | pace }} min/KM{% endif %}</span></td>
       {%- if ltype != 'event' -%}
         <td class="long"><span class="label">Race</span> <span><a href="{{ url_for('races', title=row.event|trim|urlescape, year=row.date|year, start=None, show=ns.show) }}">{{ row.event|trim|e }} ({{ row.distance|trim|e }} KM)</a></span></td>
       {%- endif -%}
         <td class="nowrap"><span class="label">Date</span> <span>{{ row.date|cleandate|e }}</span></td>
         <td class="long"><span class="label">Notes</span> <span>
-         {%- if row.sex and row.sexposition and row.sexposition | int <= 100 %}{{ row.sexposition|ordinal|e }} {{ row.sex|lower|e }}{% endif -%}
+         {%- if row.sex and row.sexposition and row.sexposition | int <= 100 %}{{ row.sexposition|ordinal|e }} {{ row.sex|lower|gender|e }}{% endif -%}
         {%- if row.sexposition and row.sexposition | int <= 100 and row.catposition and row.catposition | int <= 100 %} and {% endif -%}
         {%- if row.catposition and row.catposition | int <= 100 %}{{ row.catposition|ordinal|e }} in category{% endif -%}
         </span>
--- a/templates/list-runners.html
+++ b/templates/list-runners.html
@ -2,7 +2,7 @@

 {% include 'head.html' with context %}
 <article>
-<h1>AAC Results: Top Runners by Race Mileage{% if year %} in {{ year }}{% endif %}</h1>
+<h1>AAC Results: Top Athletes by Race Mileage{% if year %} in {{ year }}{% endif %}</h1>
 {% if results -%}
  {%- set ns.total = 0 -%}
  {%- if 'count' in results -%}