Calculate sex and category positions.

This commit is contained in:
Timothy Allen 2018-08-14 11:05:57 +02:00
parent ddd3f64fa3
commit 6e522170fb
4 changed files with 134 additions and 55 deletions

View File

@ -20,6 +20,14 @@ def urlescape(string):
def pace(time):
    """Format a duration as a zero-padded ``MM:SS`` string.

    ``time`` is added to a fixed anchor datetime so that ``strftime`` can be
    used on the result. Only the minute and second fields are rendered, so
    any whole hours in ``time`` are not shown.
    """
    anchor = dt.datetime(1, 1, 1)
    shifted = anchor + time
    return shifted.strftime('%M:%S')
@app.template_filter('gender')
def gender(sex):
    """Template filter: turn a sex code into the word used in result notes.

    ``'f'`` becomes ``'lady'`` and ``'m'`` becomes ``'man'``. Any other value
    is returned unchanged, so an unexpected code is displayed as stored
    instead of falling through to ``None`` (which the template's ``|e``
    filter would print as the literal text "None").
    """
    if sex == 'f':
        return 'lady'
    if sex == 'm':
        return 'man'
    return sex
@app.template_filter('year')
def year(time):
    """Template filter: return the ``'%Y'`` (year) rendering of ``time``.

    ``time`` must provide ``strftime``.
    """
    year_text = time.strftime('%Y')
    return year_text
@ -69,7 +77,7 @@ def getshow():
return show
def read_db(listing=None, event=None, person=None, licence=None, search=dict(), year=None, finishers=False):
def read_db(listing=None, event=None, person=None, licence=None, search=dict(), year=None):
db = MySQLdb.connect(user='aac', passwd='saOAcCWHg4LaoSSA', db='AAC',
use_unicode=True, charset="utf8", cursorclass=MySQLdb.cursors.DictCursor)
c = db.cursor()
@ -140,7 +148,7 @@ def read_db(listing=None, event=None, person=None, licence=None, search=dict(),
where += ' AND CONCAT_WS(" ", name, surname) NOT LIKE "%blank%card%"'
where += ' AND CONCAT_WS(" ", name, surname) NOT LIKE "%disqualified%"'
if listing == 'races':
select = 'TRIM(event), date'
select = 'TRIM(event) AS event, date'
group = 'GROUP BY event, date'
order = 'date DESC, TRIM(event)'
elif listing == 'runners':
@ -167,9 +175,9 @@ def read_db(listing=None, event=None, person=None, licence=None, search=dict(),
lastdate = lastdate.replace(year=int(year))
where += ' AND date > "{}" AND date < "{}"'.format(firstdate, lastdate)
''' This statement is expensive but doesn't increase the count, so don't change the count statement '''
if finishers:
select = 'total.finishers, query.* FROM( SELECT *'
close = ') AS query INNER JOIN (SELECT event, date, distance, COUNT(event) as finishers FROM `results` GROUP BY event, distance, date) AS total ON total.event=query.event AND total.date=query.date AND total.distance=query.distance'
#if finishers:
# select = 'total.finishers, query.* FROM( SELECT *'
# close = ') AS query INNER JOIN (SELECT event, date, distance, COUNT(event) as finishers FROM `results` GROUP BY event, distance, date) AS total ON total.event=query.event AND total.date=query.date AND total.distance=query.distance'
sql = 'SELECT {} FROM `results` {} {} ORDER BY {} {} {};'.format(select, where, group, order, limit, close)
@ -257,7 +265,7 @@ def races(year=None, title=None):
def person(title=None, year=None):
    """Render the results listing for one person.

    title -- URL-quoted value identifying the person; decoded with
             ``urllib.parse.unquote_plus`` when given, then passed to
             ``read_db`` as ``person``.
    year  -- passed through unchanged to ``read_db`` and to the template.

    Returns the rendered ``index.html`` template with ``ltype='person'``.
    """
    if title is not None:
        title = urllib.parse.unquote_plus(title)
    # Query exactly once: read_db() takes no `finishers` argument, and a
    # second call would only repeat the database round trip.
    results = read_db(person=title, year=year)
    return render_template('index.html', ltype='person', title=title,
                           results=results, year=year,
                           request=request, getstart=getstart(), getshow=getshow(), now=now(), PAGE_SIZE=PAGE_SIZE)
@ -266,7 +274,7 @@ def person(title=None, year=None):
def licence(year=None, title=None):
    """Render the results listing for one licence number.

    year  -- year to show; when omitted (``None``) the current year is used.
             The default is resolved on every call rather than in the
             signature, because a default of ``now().year`` is evaluated only
             once, when the module is imported, and would go stale in a
             long-running server.
    title -- URL-quoted licence value; decoded with
             ``urllib.parse.unquote_plus`` when given, then passed to
             ``read_db`` as ``licence``.

    Returns the rendered ``index.html`` template with ``ltype='licence'``.
    """
    if year is None:
        year = now().year
    if title is not None:
        title = urllib.parse.unquote_plus(title)
    # Query exactly once: read_db() takes no `finishers` argument, and a
    # second call would only repeat the database round trip.
    results = read_db(licence=title, year=year)
    return render_template('index.html', ltype='licence', title=title,
                           results=results, year=year,
                           request=request, getstart=getstart(), getshow=getshow(), now=now(), PAGE_SIZE=PAGE_SIZE)

View File

@ -15,6 +15,7 @@ import mimetypes
import csv
import xlrd
import MySQLdb
import MySQLdb.cursors
import argparse
import datetime as dt
import dateutil.parser as dtp
@ -45,12 +46,16 @@ def main():
args = parse_arguments()
rows = []
if args.scrape_web:
if args.calculate:
#position_calculations('Winelands Marathon')
position_calculations()
return
elif args.scrape_web:
spreadsheets = []
uniqurl = []
wpa = 'http://www.wpa.org.za/Events/DynamicEvents.asmx/BuildEventDisplay'
for year in range(2016, dt.datetime.now().year + 1):
log.debug("Finding results for %s" % year);
log.debug("Finding results for {}".format(year));
args = {"WPAExtra":"True","TimeColumn":"True","entityid":"674417","selectedyear":year,"selectedmonth":0,"commissionid":"0","selectedstate":"0","categoryid":0,"themeid":"46"}
data = bytes(json.dumps(args).encode('utf8'))
req = urllib.request.Request(wpa, data=data, headers={'content-type': 'application/json'})
@ -79,7 +84,7 @@ def main():
for race in spreadsheets:
url = race['url']
with urllib.request.urlopen(url) as response, tempfile.TemporaryDirectory() as tmpdir:
log.info("Loading data from URL %s" % race['url'])
log.info("Loading data from URL {}".format(race['url']))
data = response.read()
urlparts = urllib.parse.urlparse(url)
filename = os.path.basename(urlparts.path)
@ -89,7 +94,7 @@ def main():
try:
rows = read_spreadsheet(filepath, src=url, eventname=race['event'], eventdate=race['date'], eventdistance=race['distance'])
except:
log.warning("ERROR: Unable to load data from URL %s" % url)
log.warning("ERROR: Unable to load data from URL {}".format(url))
raise
else:
load_into_db(rows)
@ -99,7 +104,7 @@ def main():
elif args.input_file:
rows = read_spreadsheet(args.input_file, src=args.input_file)
log.info("Loading data from file %s" % args.input_file)
log.info("Loading data from file {}".format(args.input_file))
load_into_db(rows)
else:
@ -113,21 +118,74 @@ def main():
if not filename:
if not ext:
ext = '.xls' # attempt to decode as a spreadsheet
filename = 'part-%03d%s' % (counter, ext)
filename = 'part-{:03}{}'.format(counter, ext)
counter += 1
if re.search('.xl(b|s)x?$', filename, flags=re.IGNORECASE) is not None:
with tempfile.TemporaryDirectory() as tmpdir:
filepath = os.path.join(tmpdir, filename)
with open(filepath, 'wb') as fp:
fp.write(part.get_payload(decode=True))
log.info("Loading data from file %s" % filename)
log.info("Loading data from file {}".format(filename))
try:
rows = read_spreadsheet(filepath, src=message['from'])
load_into_db(rows)
except:
log.info("Unable to load data from file %s" % filename)
log.info("Unable to load data from file {}".format(filename))
pass
else:
load_into_db(rows)
position_calculations()
return
def position_calculations(event=None):
    """Fill in unset finisher counts and sex/category positions in `results`.

    For every distinct (event, date, distance) combination in the `results`
    table, five UPDATE statements are run that set, only where the target
    column is still NULL:

      * finishers     -- number of rows for the race
      * sexfinishers  -- number of rows for the race with the same sex
      * sexposition   -- rank within the sex, ordered by overall position
      * catfinishers  -- number of rows for the race with the same category
      * catposition   -- rank within the category, ordered by overall position

    event -- optional substring; when given, only races whose event name
             matches ``LIKE '%<event>%'`` are processed.

    All values are passed to the driver as query parameters rather than being
    formatted into the SQL text, so event names containing quotes cannot break
    or alter the statements. The work is committed once after all races have
    been processed, and the connection is always closed.
    """
    # NOTE(review): database credentials are hard-coded here (and in the other
    # connect calls in this project); consider moving them to configuration.
    db = MySQLdb.connect(user='aac', passwd='saOAcCWHg4LaoSSA', db='AAC',
                         use_unicode=True, charset="utf8", cursorclass=MySQLdb.cursors.DictCursor)
    try:
        c = db.cursor()

        sql = 'SELECT event, date, distance FROM `results`'
        params = None
        if event:
            sql += ' WHERE event LIKE %s'
            params = ('%{}%'.format(event),)
        sql += ' GROUP BY event, date, distance'
        c.execute(sql, params)
        # Fetch everything up front: the same cursor is reused for the updates.
        races = c.fetchall()

        # Shared filter selecting the rows of one race; takes three parameters.
        race_filter = 'event LIKE %s AND date = %s AND distance LIKE %s'

        # (reset @rank first?, statement) pairs, executed in order per race.
        # The sub-selects group by every non-aggregated column they return so
        # they are also accepted when ONLY_FULL_GROUP_BY is enabled.
        updates = (
            # Total finishers per race
            (False,
             'UPDATE `results` AS r, (SELECT event, date, distance, COUNT(distance) AS finishers'
             ' FROM `results` WHERE ' + race_filter +
             ' GROUP BY event, date, distance) AS f'
             ' SET r.finishers = f.finishers'
             ' WHERE r.event = f.event AND r.date = f.date AND r.distance = f.distance'
             ' AND r.finishers IS NULL;'),
            # Total finishers per sex per race
            (False,
             'UPDATE `results` AS r, (SELECT event, date, distance, sex, COUNT(sex) AS sexfinishers'
             ' FROM `results` WHERE ' + race_filter +
             ' GROUP BY event, date, distance, sex) AS s'
             ' SET r.sexfinishers = s.sexfinishers'
             ' WHERE r.event = s.event AND r.date = s.date AND r.distance = s.distance'
             ' AND r.sexfinishers IS NULL AND r.sex = s.sex;'),
            # Individual positions per sex per race
            (True,
             'UPDATE `results` AS r, (SELECT result_key, position,'
             ' @rank := IF(@sex = sex, @rank+1, 1) AS srank, @sex := sex'
             ' FROM `results` WHERE ' + race_filter +
             ' AND sex IS NOT NULL ORDER BY sex, position) AS s'
             ' SET r.sexposition = s.srank'
             ' WHERE r.result_key = s.result_key AND r.sexposition IS NULL;'),
            # Total finishers per category per race
            (False,
             'UPDATE `results` AS r, (SELECT event, date, distance, category, COUNT(category) AS catfinishers'
             ' FROM `results` WHERE ' + race_filter +
             ' GROUP BY event, date, distance, category) AS c'
             ' SET r.catfinishers = c.catfinishers'
             ' WHERE r.event = c.event AND r.date = c.date AND r.distance = c.distance'
             ' AND r.catfinishers IS NULL AND r.category = c.category;'),
            # Individual positions per category per race
            (True,
             'UPDATE `results` AS r, (SELECT result_key, position,'
             ' @rank := IF(@cat = category, @rank+1, 1) AS crank, @cat := category'
             ' FROM `results` WHERE ' + race_filter +
             ' AND category IS NOT NULL ORDER BY category, position) AS c'
             ' SET r.catposition = c.crank'
             ' WHERE r.result_key = c.result_key AND r.catposition IS NULL;'),
        )

        for race in races:
            log.debug(race)
            log.debug("Recalculating postion information for {}".format(race['event']))
            race_key = (race['event'], race['date'], race['distance'])
            for reset_rank, statement in updates:
                if reset_rank:
                    c.execute('SET @rank = 0;')
                c.execute(statement, race_key)

        db.commit()
    finally:
        db.close()
@ -136,7 +194,7 @@ def load_into_db(rows):
CREATE TABLE `results` (
`result_key` int(11) NOT NULL AUTO_INCREMENT,
`date` datetime DEFAULT NULL,
`distance` float(10) DEFAULT NULL,
`distance` decimal(4,2) DEFAULT NULL,
`event` varchar(100) COLLATE utf8_unicode_ci NOT NULL,
`position` int(5) NOT NULL,
`finishers` int(5) DEFAULT NULL,
@ -147,15 +205,17 @@ def load_into_db(rows):
`club` varchar(40) COLLATE utf8_unicode_ci DEFAULT NULL,
`age` int(3) DEFAULT NULL,
`sex` varchar(10) COLLATE utf8_unicode_ci DEFAULT NULL,
`sexposition` int(5) NOT NULL,
`sexfinishers` int(5) DEFAULT NULL,
`category` varchar(15) COLLATE utf8_unicode_ci DEFAULT NULL,
`sexposition` int(5) DEFAULT NULL,
`catposition` int(5) DEFAULT NULL,
`catposition` int(5) NOT NULL,
`catfinishers` int(5) DEFAULT NULL,
`source` varchar(200) COLLATE utf8_unicode_ci DEFAULT NULL,
PRIMARY KEY (`result_key`)
) ENGINE=InnoDB CHARSET=utf8 COLLATE=utf8_unicode_ci;
'''
if rows is None or len(rows) < 1:
log.warning("No data found in spreadsheet")
log.warning("**** No data found in spreadsheet ****")
else:
db = MySQLdb.connect(user='aac', passwd='saOAcCWHg4LaoSSA', db='AAC',
use_unicode=True, charset="utf8")
@ -179,13 +239,13 @@ def load_into_db(rows):
for r in rows:
fields = ', '.join(r.keys())
values = ', '.join(['%s'] * len(r)) # placeholder values
sql = 'INSERT into `results` ( %s ) VALUES ( %s )' % (fields, values)
sql = 'INSERT into `results` ( {} ) VALUES ( {} )'.format(fields, values)
try:
c.execute(sql, r.values())
except :
e = sys.exc_info()[0]
log.debug("ERROR: %s" % e)
log.debug("Last query was: %s" % c._last_executed)
log.debug("ERROR: {}".format(e))
log.debug("Last query was: {}".format(c._last_executed))
raise
#pass
@ -200,7 +260,7 @@ def read_spreadsheet(spreadsheet, src=None, eventname=None, eventdate=None, even
book = xlrd.open_workbook(spreadsheet)
for sheetname in book.sheet_names():
sheet = book.sheet_by_name(sheetname)
log.debug("Processing sheet %s" % sheetname)
log.debug("Processing sheet {}".format(sheetname))
''' Look for the header in the first 15 rows, searching from the top '''
fields = []
@ -208,7 +268,7 @@ def read_spreadsheet(spreadsheet, src=None, eventname=None, eventdate=None, even
try:
if re.search('((pos\w*|no\w*|num\w*|surname|name|time|club)\s*){2,}', ' '.join(str(x) for x in (sheet.row_values(row))), flags=re.IGNORECASE) is not None:
fields = sheet.row_values(row)
log.debug("Spreadsheet fields: %s" % ', '.join(str(x) for x in fields))
log.debug("Spreadsheet fields: {}".format(', '.join(str(x) for x in fields)))
break
except:
''' Probably a blank sheet, let's skip '''
@ -237,7 +297,7 @@ def read_spreadsheet(spreadsheet, src=None, eventname=None, eventdate=None, even
fields[i] = 'club'
elif re.search('^\s*age(\.|\w*)\s*$', str(fields[i]), flags=re.IGNORECASE) is not None:
fields[i] = 'age'
elif re.search('^\s*(sex|gender|m.?f\b|male|female)(\.|\w*)\s*$', str(fields[i]), flags=re.IGNORECASE) is not None:
elif re.search('^\s*(sex|gender|m.?f|male|female)(\.|\w*)\s*$', str(fields[i]), flags=re.IGNORECASE) is not None:
fields[i] = 'sex'
elif re.search('^\s*cat(\.|\w*)\s*$', str(fields[i]), flags=re.IGNORECASE) is not None:
fields[i] = 'category'
@ -257,14 +317,32 @@ def read_spreadsheet(spreadsheet, src=None, eventname=None, eventdate=None, even
''' Look for the date in the file name, and then look the first 15 rows and override it '''
if eventdate is None:
eventdate = dt.datetime.min
filedate = re.search('(20\d{2})', str(filename), flags=re.IGNORECASE)
if filedate is not None:
eventdate = filedate.group(1)
for row in range(0, 15):
if re.search('(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*\s+\d{4}', str(sheet.cell(row, 0).value), flags=re.IGNORECASE) is not None:
eventdate = sheet.cell(row,0).value
sheetdate = re.search('(\d+\s*(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*\s+\d{4})', str(sheet.cell(row, 0).value), flags=re.IGNORECASE)
if sheetdate is not None:
eventdate = sheetdate.group(1)
break
log.info("Race date: %s" % eventdate.strftime('%Y-%m-%d'))
log.info("Race date: {}".format(eventdate))
#log.info("Race date: {}".format(eventdate.strftime('%Y-%m-%d')))
if eventname is None:
''' Use the filename as the event name :-( '''
eventname, *_ = os.path.splitext(filename)
eventname = str(eventname)
''' Clean up common patterns '''
eventname = re.sub('[\-_]', ' ', eventname, flags=re.IGNORECASE)
eventname = re.sub('results?(\s*book)?', ' ', eventname, flags=re.IGNORECASE)
eventname = re.sub('export', ' ', eventname, flags=re.IGNORECASE)
eventname = re.sub('excel', ' ', eventname, flags=re.IGNORECASE)
eventname = re.sub('\(\d\)', ' ', eventname, flags=re.IGNORECASE)
eventname = re.sub('\d{0,4}20\d{2}\d{0,4}', ' ', eventname, flags=re.IGNORECASE)
eventname = re.sub('\s\s+', ' ', eventname, flags=re.IGNORECASE)
eventname = re.sub('(^\s*|\s*$)', '', eventname, flags=re.IGNORECASE)
log.info("Event name: {}".format(eventname))
''' Look for the race distance in the sheet name, or in the filename '''
distance = eventdistance
@ -280,22 +358,7 @@ def read_spreadsheet(spreadsheet, src=None, eventname=None, eventdate=None, even
sheetdistance = re.search('(helper|marshal)', sheetname, flags=re.IGNORECASE)
if sheetdistance is not None:
distance = sheetname
log.info("Race distance: %s" % distance)
if eventname is None:
''' Use the filename as the event name :-( '''
eventname, *_ = os.path.splitext(filename)
eventname = str(eventname)
''' Clean up common patterns '''
eventname = re.sub('[\-_]', ' ', eventname, flags=re.IGNORECASE)
eventname = re.sub('results?(\s*book)?', ' ', eventname, flags=re.IGNORECASE)
eventname = re.sub('export', ' ', eventname, flags=re.IGNORECASE)
eventname = re.sub('excel', ' ', eventname, flags=re.IGNORECASE)
eventname = re.sub('\(\d\)', ' ', eventname, flags=re.IGNORECASE)
eventname = re.sub('\d{0,4}20\d{2}\d{0,4}', ' ', eventname, flags=re.IGNORECASE)
eventname = re.sub('\s\s+', ' ', eventname, flags=re.IGNORECASE)
eventname = re.sub('(^\s*|\s*$)', '', eventname, flags=re.IGNORECASE)
log.info("Event name: %s" % eventname)
log.info("Race distance: {}".format(distance))
for row in range(sheet.nrows):
''' TODO: don't assume that the position is the first cell '''
@ -320,7 +383,7 @@ def read_spreadsheet(spreadsheet, src=None, eventname=None, eventdate=None, even
rows = clean_data(rows)
if len(rows) > 0:
log.debug("Sample output: %s" % pp.pformat(rows[0]))
log.debug("Sample output: {}".format(pp.pformat(rows[0])))
return rows
@ -355,12 +418,17 @@ def clean_data(input_rows):
length = re.search('([\d\.]+)\s*km', str(ir.get('distance')), flags=re.IGNORECASE)
if length is not None:
r['distance'] = length.group(1)
else:
r['distance'] = 0
''' Fix sex '''
if 'sex' in ir and re.search('^\s*F', str(ir.get('sex')), flags=re.IGNORECASE) is not None:
r['sex'] = 'female'
if 'sex' in ir:
sex = 'sex' in ir and re.search('^\s*(F|M)', str(ir.get('sex')), flags=re.IGNORECASE)
if sex is not None:
if sex.group(1) == 'F':
r['sex'] = 'F'
else:
r['sex'] = 'male'
r['sex'] = 'M'
''' Fix club '''
if re.search('^\s*(AAC\b|Atlantic\s*Athletic)', str(ir.get('club')), flags=re.IGNORECASE) is not None:
@ -387,7 +455,7 @@ def clean_data(input_rows):
r[key] = 0
''' Should be a string '''
for key in ( 'event', 'name', 'surname', 'licence', 'club', 'category', 'sex', ):
for key in ( 'event', 'name', 'surname', 'licence', 'club', 'category', ):
val = ir.get(key)
if isinstance(val, float):
val = int(val)
@ -410,6 +478,9 @@ def parse_arguments():
parser.add_argument(
'--web', '-w', action='store_true', required=False, dest='scrape_web',
help='Scrape WPA website')
parser.add_argument(
'--calc', '-c', action='store_true', required=False, dest='calculate',
help='Calculate unset positions in the database')
parser.add_argument(
'--input', '-i', action='store', required=False, type=str, dest='input_file',
help='Manually select the spreadsheet to be imported')
@ -422,7 +493,7 @@ def parse_arguments():
if not os.path.exists(args.input_file) or not os.access(args.input_file, os.R_OK):
raise
logging.basicConfig()
logging.basicConfig(format='%(message)s')
if args.verbose is not None and args.verbose == 1:
log.setLevel(logging.INFO)
elif args.verbose is not None and args.verbose >= 2:

View File

@ -37,13 +37,13 @@
<td class="nowrap"><span class="label">Name</span> <span><a href="{{ url_for('person', title=person|trim|urlescape, start=None) }}">{{ person|trim|e }}</a></span></td>
<td class="nowrap"><span class="label">Licence</span> <span>{% if row.licence %}<a href="{{ url_for('licence', title=row.licence|trim|urlescape, year=row.date|year, start=None, show=ns.show) }}">{{ row.licence|trim|e }}</a>{% endif %}</span></td>
<td><span class="label">Time</span> <span>{{ row.time|e }}</span></td>
<td class="nowrap"><span class="label">Average Pace</span> <span>{% if row.distance is number and row.distance|int != 0 %}{{ (row.time / row.distance) | pace }} min/KM{% endif %}</span></td>
<td class="nowrap"><span class="label">Average Pace</span> <span>{% if row.distance is number and row.distance|float != 0 %}{{ (row.time / row.distance|float) | pace }} min/KM{% endif %}</span></td>
{%- if ltype != 'event' -%}
<td class="long"><span class="label">Race</span> <span><a href="{{ url_for('races', title=row.event|trim|urlescape, year=row.date|year, start=None, show=ns.show) }}">{{ row.event|trim|e }} ({{ row.distance|trim|e }} KM)</a></span></td>
{%- endif -%}
<td class="nowrap"><span class="label">Date</span> <span>{{ row.date|cleandate|e }}</span></td>
<td class="long"><span class="label">Notes</span> <span>
{%- if row.sex and row.sexposition and row.sexposition | int <= 100 %}{{ row.sexposition|ordinal|e }} {{ row.sex|lower|e }}{% endif -%}
{%- if row.sex and row.sexposition and row.sexposition | int <= 100 %}{{ row.sexposition|ordinal|e }} {{ row.sex|lower|gender|e }}{% endif -%}
{%- if row.sexposition and row.sexposition | int <= 100 and row.catposition and row.catposition | int <= 100 %} and {% endif -%}
{%- if row.catposition and row.catposition | int <= 100 %}{{ row.catposition|ordinal|e }} in category{% endif -%}
</span>

View File

@ -2,7 +2,7 @@
{% include 'head.html' with context %}
<article>
<h1>AAC Results: Top Runners by Race Mileage{% if year %} in {{ year }}{% endif %}</h1>
<h1>AAC Results: Top Athletes by Race Mileage{% if year %} in {{ year }}{% endif %}</h1>
{% if results -%}
{%- set ns.total = 0 -%}
{%- if 'count' in results -%}