From 6e522170fb319a950c1620fa4c51c707baac8e1d Mon Sep 17 00:00:00 2001 From: Timothy Allen Date: Tue, 14 Aug 2018 11:05:57 +0200 Subject: [PATCH] Calculate sex and category positions. --- aacstats.py | 22 +++-- load_spreadsheet.py | 161 ++++++++++++++++++++++++++---------- templates/index.html | 4 +- templates/list-runners.html | 2 +- 4 files changed, 134 insertions(+), 55 deletions(-) diff --git a/aacstats.py b/aacstats.py index 39517a2..e5bbfc4 100644 --- a/aacstats.py +++ b/aacstats.py @@ -20,6 +20,14 @@ def urlescape(string): def pace(time): return (dt.datetime(1,1,1) + time).strftime('%M:%S') +@app.template_filter('gender') +def gender(sex): + if sex == 'f': + return 'lady' + elif sex == 'm': + return 'man' + return + @app.template_filter('year') def year(time): return time.strftime('%Y') @@ -69,7 +77,7 @@ def getshow(): return show -def read_db(listing=None, event=None, person=None, licence=None, search=dict(), year=None, finishers=False): +def read_db(listing=None, event=None, person=None, licence=None, search=dict(), year=None): db = MySQLdb.connect(user='aac', passwd='saOAcCWHg4LaoSSA', db='AAC', use_unicode=True, charset="utf8", cursorclass=MySQLdb.cursors.DictCursor) c = db.cursor() @@ -140,7 +148,7 @@ def read_db(listing=None, event=None, person=None, licence=None, search=dict(), where += ' AND CONCAT_WS(" ", name, surname) NOT LIKE "%blank%card%"' where += ' AND CONCAT_WS(" ", name, surname) NOT LIKE "%disqualified%"' if listing == 'races': - select = 'TRIM(event), date' + select = 'TRIM(event) AS event, date' group = 'GROUP BY event, date' order = 'date DESC, TRIM(event)' elif listing == 'runners': @@ -167,9 +175,9 @@ def read_db(listing=None, event=None, person=None, licence=None, search=dict(), lastdate = lastdate.replace(year=int(year)) where += ' AND date > "{}" AND date < "{}"'.format(firstdate, lastdate) ''' This statement is expensive but doesn't increase the count, so don't change the count statement ''' - if finishers: - select = 
'total.finishers, query.* FROM( SELECT *' - close = ') AS query INNER JOIN (SELECT event, date, distance, COUNT(event) as finishers FROM `results` GROUP BY event, distance, date) AS total ON total.event=query.event AND total.date=query.date AND total.distance=query.distance' + #if finishers: + # select = 'total.finishers, query.* FROM( SELECT *' + # close = ') AS query INNER JOIN (SELECT event, date, distance, COUNT(event) as finishers FROM `results` GROUP BY event, distance, date) AS total ON total.event=query.event AND total.date=query.date AND total.distance=query.distance' sql = 'SELECT {} FROM `results` {} {} ORDER BY {} {} {};'.format(select, where, group, order, limit, close) @@ -257,7 +265,7 @@ def races(year=None, title=None): def person(title=None, year=None): if title is not None: title = urllib.parse.unquote_plus(title) - results = read_db(person=title, year=year, finishers=True) + results = read_db(person=title, year=year) return render_template('index.html', ltype='person', title=title, results=results, year=year, request=request, getstart=getstart(), getshow=getshow(), now=now(), PAGE_SIZE=PAGE_SIZE) @@ -266,7 +274,7 @@ def person(title=None, year=None): def licence(year=now().year, title=None): if title is not None: title = urllib.parse.unquote_plus(title) - results = read_db(licence=title, year=year, finishers=True) + results = read_db(licence=title, year=year) return render_template('index.html', ltype='licence', title=title, results=results, year=year, request=request, getstart=getstart(), getshow=getshow(), now=now(), PAGE_SIZE=PAGE_SIZE) diff --git a/load_spreadsheet.py b/load_spreadsheet.py index c868c2d..3a2a8b2 100755 --- a/load_spreadsheet.py +++ b/load_spreadsheet.py @@ -15,6 +15,7 @@ import mimetypes import csv import xlrd import MySQLdb +import MySQLdb.cursors import argparse import datetime as dt import dateutil.parser as dtp @@ -45,12 +46,16 @@ def main(): args = parse_arguments() rows = [] - if args.scrape_web: + if args.calculate: + 
#position_calculations('Winelands Marathon') + position_calculations() + return + elif args.scrape_web: spreadsheets = [] uniqurl = [] wpa = 'http://www.wpa.org.za/Events/DynamicEvents.asmx/BuildEventDisplay' for year in range(2016, dt.datetime.now().year + 1): - log.debug("Finding results for %s" % year); + log.debug("Finding results for {}".format(year)); args = {"WPAExtra":"True","TimeColumn":"True","entityid":"674417","selectedyear":year,"selectedmonth":0,"commissionid":"0","selectedstate":"0","categoryid":0,"themeid":"46"} data = bytes(json.dumps(args).encode('utf8')) req = urllib.request.Request(wpa, data=data, headers={'content-type': 'application/json'}) @@ -79,7 +84,7 @@ def main(): for race in spreadsheets: url = race['url'] with urllib.request.urlopen(url) as response, tempfile.TemporaryDirectory() as tmpdir: - log.info("Loading data from URL %s" % race['url']) + log.info("Loading data from URL {}".format(race['url'])) data = response.read() urlparts = urllib.parse.urlparse(url) filename = os.path.basename(urlparts.path) @@ -89,7 +94,7 @@ def main(): try: rows = read_spreadsheet(filepath, src=url, eventname=race['event'], eventdate=race['date'], eventdistance=race['distance']) except: - log.warning("ERROR: Unable to load data from URL %s" % url) + log.warning("ERROR: Unable to load data from URL {}".format(url)) raise else: load_into_db(rows) @@ -99,7 +104,7 @@ def main(): elif args.input_file: rows = read_spreadsheet(args.input_file, src=args.input_file) - log.info("Loading data from file %s" % args.input_file) + log.info("Loading data from file {}".format(args.input_file)) load_into_db(rows) else: @@ -113,30 +118,83 @@ def main(): if not filename: if not ext: ext = '.xls' # attempt to decode as a spreadsheet - filename = 'part-%03d%s' % (counter, ext) + filename = 'part-{:03}{}'.format(counter, ext) counter += 1 if re.search('.xl(b|s)x?$', filename, flags=re.IGNORECASE) is not None: with tempfile.TemporaryDirectory() as tmpdir: filepath = 
os.path.join(tmpdir, filename) with open(filepath, 'wb') as fp: fp.write(part.get_payload(decode=True)) - log.info("Loading data from file %s" % filename) + log.info("Loading data from file {}".format(filename)) try: rows = read_spreadsheet(filepath, src=message['from']) - load_into_db(rows) except: - log.info("Unable to load data from file %s" % filename) + log.info("Unable to load data from file {}".format(filename)) pass + else: + load_into_db(rows) + position_calculations() return +def position_calculations(event=None): + db = MySQLdb.connect(user='aac', passwd='saOAcCWHg4LaoSSA', db='AAC', + use_unicode=True, charset="utf8", cursorclass=MySQLdb.cursors.DictCursor) + c = db.cursor() + + where = '' + if event: + where = 'WHERE event LIKE "%{}%"'.format(event) + sql = 'SELECT event, date, distance FROM `results` {} GROUP BY event, date, distance'.format(where) + c.execute(sql) + #log.debug(c._last_executed) + eventlist = [e for e in c.fetchall()] + + for race in eventlist: + log.debug(race) + + log.debug("Recalculating position information for {}".format(race['event'])) + ''' Calculate total finishers per race ''' + sql = 'UPDATE `results` AS r, (SELECT event, date, distance, COUNT(distance) AS finishers FROM `results` WHERE event LIKE "{}" AND date = "{}" AND distance LIKE "{}" GROUP BY event, date, distance) AS f SET r.finishers = f.finishers WHERE r.event = f.event AND r.date = f.date AND r.distance = f.distance AND r.finishers IS NULL;'.format(race['event'], race['date'], race['distance']) + c.execute(sql) + result = c.fetchall() + + ''' Update total finishers per sex per race ''' + sql = 'UPDATE `results` AS r, (SELECT event, date, distance, sex, COUNT(sex) as sexfinishers FROM `results` WHERE event LIKE "{}" AND date = "{}" AND distance LIKE "{}" GROUP BY sex) AS s SET r.sexfinishers = s.sexfinishers WHERE r.event = s.event AND r.date = s.date AND r.distance = s.distance AND r.sexfinishers IS NULL AND r.sex = s.sex;'.format(race['event'], race['date'], 
race['distance']) + #print(sql) + c.execute(sql) + result = c.fetchall() + + ''' Update individual positions per sex per race ''' + c.execute('SET @rank = 0;') + sql = 'UPDATE `results` AS r, (SELECT result_key, position, @rank := IF(@sex = sex, @rank+1, 1) AS srank, @sex := sex FROM `results` WHERE event LIKE "{}" AND date = "{}" AND distance LIKE "{}" AND sex IS NOT NULL ORDER BY sex, position) AS s SET r.sexposition = s.srank WHERE r.result_key = s.result_key AND r.sexposition IS NULL;'.format(race['event'], race['date'], race['distance']) + #print(sql) + c.execute(sql) + result = c.fetchall() + + ''' Update total finishers per category per race ''' + sql = 'UPDATE `results` AS r, (SELECT event, date, distance, category, COUNT(category) as catfinishers FROM `results` WHERE event LIKE "{}" AND date = "{}" AND distance LIKE "{}" GROUP BY category) AS c SET r.catfinishers = c.catfinishers WHERE r.event = c.event AND r.date = c.date AND r.distance = c.distance AND r.catfinishers IS NULL AND r.category = c.category;'.format(race['event'], race['date'], race['distance']) + #print(sql) + c.execute(sql) + result = c.fetchall() + + ''' Update individual positions per category per race ''' + c.execute('SET @rank = 0;') + sql = 'UPDATE `results` AS r, (SELECT result_key, position, @rank := IF(@cat = category, @rank+1, 1) AS crank, @cat := category FROM `results` WHERE event LIKE "{}" AND date = "{}" AND distance LIKE "{}" AND category IS NOT NULL ORDER BY category, position) AS c SET r.catposition = c.crank WHERE r.result_key = c.result_key AND r.catposition IS NULL;'.format(race['event'], race['date'], race['distance']) + #print(sql) + c.execute(sql) + result = c.fetchall() + + db.commit() + return + def load_into_db(rows): ''' CREATE TABLE `results` ( `result_key` int(11) NOT NULL AUTO_INCREMENT, `date` datetime DEFAULT NULL, - `distance` float(10) DEFAULT NULL, + `distance` decimal(4,2) DEFAULT NULL, `event` varchar(100) COLLATE utf8_unicode_ci NOT NULL, `position` 
int(5) NOT NULL, `finishers` int(5) DEFAULT NULL, @@ -147,15 +205,17 @@ def load_into_db(rows): `club` varchar(40) COLLATE utf8_unicode_ci DEFAULT NULL, `age` int(3) DEFAULT NULL, `sex` varchar(10) COLLATE utf8_unicode_ci DEFAULT NULL, + `sexposition` int(5) NOT NULL, + `sexfinishers` int(5) DEFAULT NULL, `category` varchar(15) COLLATE utf8_unicode_ci DEFAULT NULL, - `sexposition` int(5) DEFAULT NULL, - `catposition` int(5) DEFAULT NULL, + `catposition` int(5) NOT NULL, + `catfinishers` int(5) DEFAULT NULL, `source` varchar(200) COLLATE utf8_unicode_ci DEFAULT NULL, PRIMARY KEY (`result_key`) ) ENGINE=InnoDB CHARSET=utf8 COLLATE=utf8_unicode_ci; ''' if rows is None or len(rows) < 1: - log.warning("No data found in spreadsheet") + log.warning("**** No data found in spreadsheet ****") else: db = MySQLdb.connect(user='aac', passwd='saOAcCWHg4LaoSSA', db='AAC', use_unicode=True, charset="utf8") @@ -179,13 +239,13 @@ def load_into_db(rows): for r in rows: fields = ', '.join(r.keys()) values = ', '.join(['%s'] * len(r)) # placeholder values - sql = 'INSERT into `results` ( %s ) VALUES ( %s )' % (fields, values) + sql = 'INSERT into `results` ( {} ) VALUES ( {} )'.format(fields, values) try: c.execute(sql, r.values()) except : e = sys.exc_info()[0] - log.debug("ERROR: %s" % e) - log.debug("Last query was: %s" % c._last_executed) + log.debug("ERROR: {}".format(e)) + log.debug("Last query was: {}".format(c._last_executed)) raise #pass @@ -200,7 +260,7 @@ def read_spreadsheet(spreadsheet, src=None, eventname=None, eventdate=None, even book = xlrd.open_workbook(spreadsheet) for sheetname in book.sheet_names(): sheet = book.sheet_by_name(sheetname) - log.debug("Processing sheet %s" % sheetname) + log.debug("Processing sheet {}".format(sheetname)) ''' Look for the header in the first 15 rows, searching from the top ''' fields = [] @@ -208,7 +268,7 @@ def read_spreadsheet(spreadsheet, src=None, eventname=None, eventdate=None, even try: if 
re.search('((pos\w*|no\w*|num\w*|surname|name|time|club)\s*){2,}', ' '.join(str(x) for x in (sheet.row_values(row))), flags=re.IGNORECASE) is not None: fields = sheet.row_values(row) - log.debug("Spreadsheet fields: %s" % ', '.join(str(x) for x in fields)) + log.debug("Spreadsheet fields: {}".format(', '.join(str(x) for x in fields))) break except: ''' Probably a blank sheet, let's skip ''' @@ -237,7 +297,7 @@ def read_spreadsheet(spreadsheet, src=None, eventname=None, eventdate=None, even fields[i] = 'club' elif re.search('^\s*age(\.|\w*)\s*$', str(fields[i]), flags=re.IGNORECASE) is not None: fields[i] = 'age' - elif re.search('^\s*(sex|gender|m.?f\b|male|female)(\.|\w*)\s*$', str(fields[i]), flags=re.IGNORECASE) is not None: + elif re.search('^\s*(sex|gender|m.?f|male|female)(\.|\w*)\s*$', str(fields[i]), flags=re.IGNORECASE) is not None: fields[i] = 'sex' elif re.search('^\s*cat(\.|\w*)\s*$', str(fields[i]), flags=re.IGNORECASE) is not None: fields[i] = 'category' @@ -257,14 +317,32 @@ def read_spreadsheet(spreadsheet, src=None, eventname=None, eventdate=None, even ''' Look for the date in the file name, and then look the first 15 rows and override it ''' if eventdate is None: + eventdate = dt.datetime.min filedate = re.search('(20\d{2})', str(filename), flags=re.IGNORECASE) if filedate is not None: eventdate = filedate.group(1) for row in range(0, 15): - if re.search('(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*\s+\d{4}', str(sheet.cell(row, 0).value), flags=re.IGNORECASE) is not None: - eventdate = sheet.cell(row,0).value + sheetdate = re.search('(\d+\s*(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*\s+\d{4})', str(sheet.cell(row, 0).value), flags=re.IGNORECASE) + if sheetdate is not None: + eventdate = sheetdate.group(1) break - log.info("Race date: %s" % eventdate.strftime('%Y-%m-%d')) + log.info("Race date: {}".format(eventdate)) + #log.info("Race date: {}".format(eventdate.strftime('%Y-%m-%d'))) + + if eventname is None: + ''' Use the 
filename as the event name :-( ''' + eventname, *_ = os.path.splitext(filename) + eventname = str(eventname) + ''' Clean up common patterns ''' + eventname = re.sub('[\-_]', ' ', eventname, flags=re.IGNORECASE) + eventname = re.sub('results?(\s*book)?', ' ', eventname, flags=re.IGNORECASE) + eventname = re.sub('export', ' ', eventname, flags=re.IGNORECASE) + eventname = re.sub('excel', ' ', eventname, flags=re.IGNORECASE) + eventname = re.sub('\(\d\)', ' ', eventname, flags=re.IGNORECASE) + eventname = re.sub('\d{0,4}20\d{2}\d{0,4}', ' ', eventname, flags=re.IGNORECASE) + eventname = re.sub('\s\s+', ' ', eventname, flags=re.IGNORECASE) + eventname = re.sub('(^\s*|\s*$)', '', eventname, flags=re.IGNORECASE) + log.info("Event name: {}".format(eventname)) ''' Look for the race distance in the sheet name, or in the filename ''' distance = eventdistance @@ -280,22 +358,7 @@ def read_spreadsheet(spreadsheet, src=None, eventname=None, eventdate=None, even sheetdistance = re.search('(helper|marshal)', sheetname, flags=re.IGNORECASE) if sheetdistance is not None: distance = sheetname - log.info("Race distance: %s" % distance) - - if eventname is None: - ''' Use the filename as the event name :-( ''' - eventname, *_ = os.path.splitext(filename) - eventname = str(eventname) - ''' Clean up common patterns ''' - eventname = re.sub('[\-_]', ' ', eventname, flags=re.IGNORECASE) - eventname = re.sub('results?(\s*book)?', ' ', eventname, flags=re.IGNORECASE) - eventname = re.sub('export', ' ', eventname, flags=re.IGNORECASE) - eventname = re.sub('excel', ' ', eventname, flags=re.IGNORECASE) - eventname = re.sub('\(\d\)', ' ', eventname, flags=re.IGNORECASE) - eventname = re.sub('\d{0,4}20\d{2}\d{0,4}', ' ', eventname, flags=re.IGNORECASE) - eventname = re.sub('\s\s+', ' ', eventname, flags=re.IGNORECASE) - eventname = re.sub('(^\s*|\s*$)', '', eventname, flags=re.IGNORECASE) - log.info("Event name: %s" % eventname) + log.info("Race distance: {}".format(distance)) for row in 
range(sheet.nrows): ''' TODO: don't assume that the position is the first cell ''' @@ -320,7 +383,7 @@ def read_spreadsheet(spreadsheet, src=None, eventname=None, eventdate=None, even rows = clean_data(rows) if len(rows) > 0: - log.debug("Sample output: %s" % pp.pformat(rows[0])) + log.debug("Sample output: {}".format(pp.pformat(rows[0]))) return rows @@ -355,12 +418,17 @@ def clean_data(input_rows): length = re.search('([\d\.]+)\s*km', str(ir.get('distance')), flags=re.IGNORECASE) if length is not None: r['distance'] = length.group(1) + else: + r['distance'] = 0 ''' Fix sex ''' - if 'sex' in ir and re.search('^\s*F', str(ir.get('sex')), flags=re.IGNORECASE) is not None: - r['sex'] = 'female' - else: - r['sex'] = 'male' + if 'sex' in ir: + sex = 'sex' in ir and re.search('^\s*(F|M)', str(ir.get('sex')), flags=re.IGNORECASE) + if sex is not None: + if sex.group(1) == 'F': + r['sex'] = 'F' + else: + r['sex'] = 'M' ''' Fix club ''' if re.search('^\s*(AAC\b|Atlantic\s*Athletic)', str(ir.get('club')), flags=re.IGNORECASE) is not None: @@ -387,7 +455,7 @@ def clean_data(input_rows): r[key] = 0 ''' Should be a string ''' - for key in ( 'event', 'name', 'surname', 'licence', 'club', 'category', 'sex', ): + for key in ( 'event', 'name', 'surname', 'licence', 'club', 'category', ): val = ir.get(key) if isinstance(val, float): val = int(val) @@ -410,6 +478,9 @@ def parse_arguments(): parser.add_argument( '--web', '-w', action='store_true', required=False, dest='scrape_web', help='Scrape WPA website') + parser.add_argument( + '--calc', '-c', action='store_true', required=False, dest='calculate', + help='Calculate unset positions in the database') parser.add_argument( '--input', '-i', action='store', required=False, type=str, dest='input_file', help='Manually select the spreadsheet to be imported') @@ -422,7 +493,7 @@ def parse_arguments(): if not os.path.exists(args.input_file) or not os.access(args.input_file, os.R_OK): raise - logging.basicConfig() + 
logging.basicConfig(format='%(message)s') if args.verbose is not None and args.verbose == 1: log.setLevel(logging.INFO) elif args.verbose is not None and args.verbose >= 2: diff --git a/templates/index.html b/templates/index.html index d04c0fd..13fd946 100644 --- a/templates/index.html +++ b/templates/index.html @@ -37,13 +37,13 @@ Name {{ person|trim|e }} Licence {% if row.licence %}{{ row.licence|trim|e }}{% endif %} Time {{ row.time|e }} - Average Pace {% if row.distance is number and row.distance|int != 0 %}{{ (row.time / row.distance) | pace }} min/KM{% endif %} + Average Pace {% if row.distance is number and row.distance|float != 0 %}{{ (row.time / row.distance|float) | pace }} min/KM{% endif %} {%- if ltype != 'event' -%} Race {{ row.event|trim|e }} ({{ row.distance|trim|e }} KM) {%- endif -%} Date {{ row.date|cleandate|e }} Notes - {%- if row.sex and row.sexposition and row.sexposition | int <= 100 %}{{ row.sexposition|ordinal|e }} {{ row.sex|lower|e }}{% endif -%} + {%- if row.sex and row.sexposition and row.sexposition | int <= 100 %}{{ row.sexposition|ordinal|e }} {{ row.sex|lower|gender|e }}{% endif -%} {%- if row.sexposition and row.sexposition | int <= 100 and row.catposition and row.catposition | int <= 100 %} and {% endif -%} {%- if row.catposition and row.catposition | int <= 100 %}{{ row.catposition|ordinal|e }} in category{% endif -%} diff --git a/templates/list-runners.html b/templates/list-runners.html index f87eb97..1cb1566 100644 --- a/templates/list-runners.html +++ b/templates/list-runners.html @@ -2,7 +2,7 @@ {% include 'head.html' with context %}
-

AAC Results: Top Runners by Race Mileage{% if year %} in {{ year }}{% endif %}

+

AAC Results: Top Athletes by Race Mileage{% if year %} in {{ year }}{% endif %}

{% if results -%} {%- set ns.total = 0 -%} {%- if 'count' in results -%}