#!/usr/bin/python3
# -*- coding: utf-8 -*-
''' Utility to load WPA results sheets into a MySQL database.'''
__author__ = 'Timothy Allen'
__email__ = 'tim@allen.org.za'
__license__ = 'MIT'
import bs4
import urllib.request
import urllib.parse
import json
import mailbox
import email
import mimetypes
import csv
import xlrd
import MySQLdb
import MySQLdb.cursors
import argparse
import datetime as dt
import dateutil.parser as dtp
import logging
import os
import re
import sys
import tempfile
from collections import defaultdict
import pprint
# Set up MySQL database, if not done
# Read Excel/ODS/CSV database into MySQL
# Check MIME attachments in email for spreadsheet, and load that ***
# Then display the data in a (separate) web application
# The user is in /etc/dovecot/users
# Password is zi6ohYae0OeYie8eegei (not that you'll ever need it)
MAILDIR = '/var/mail/virtual/aac/Maildir'
log = logging.getLogger(__file__)
pp = pprint.PrettyPrinter(indent=4)
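# Example invocations (a sketch; the flags are defined in parse_arguments() below,
# and the spreadsheet filename is a placeholder):
#   ./load_spreadsheet.py -v --url                        # scrape all WPA results since 2016
#   ./load_spreadsheet.py -v --url "Winelands Marathon"   # scrape only events matching that name or link
#   ./load_spreadsheet.py -v --input results.xlsx         # load a local spreadsheet
#   ./load_spreadsheet.py -v --calc "Winelands Marathon"  # recalculate positions for matching events
# With no arguments, spreadsheet attachments found in MAILDIR are loaded.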
def main():
if sys.version_info < (3, 2):
raise Exception(
'Unsupported Python version, please use at least Python 3.2')
args = parse_arguments()
rows = []
if args.calculate:
#position_calculations('Winelands Marathon')
position_calculations(args.calculate)
return
elif args.scrapeurl:
requrls = args.scrapeurl
spreadsheets = []
uniqurl = []
wpa = 'http://www.wpa.org.za/Events/DynamicEvents.asmx/BuildEventDisplay'
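    # The DynamicEvents endpoint expects a JSON POST and returns a JSON object whose first
    # value is an HTML fragment listing that year's events; BeautifulSoup extracts the
    # spreadsheet links from it below. The entityid/themeid values appear to identify the
    # WPA events page being queried.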
for year in range(2016, dt.datetime.now().year + 1):
log.debug("Finding results for {}".format(year));
args = {"WPAExtra":"True","TimeColumn":"True","entityid":"674417","selectedyear":year,"selectedmonth":0,"commissionid":"0","selectedstate":"0","categoryid":0,"themeid":"46"}
data = bytes(json.dumps(args).encode('utf8'))
req = urllib.request.Request(wpa, data=data, headers={'content-type': 'application/json'})
with urllib.request.urlopen(req) as response:
data = json.loads(response.read().decode('utf8'))
page, *_ = data.values() # get the first value
soup = bs4.BeautifulSoup(page, 'html.parser')
for event in soup.find_all('tr'):
raceinfo = dict()
        link = event.find('a', href=re.compile(r'\.xlsx?$'))
name = event.find('td', class_=re.compile('EventHeadline'))
date = event.find('td', class_=re.compile('EventDate'), string=re.compile('^\s*\d+[/-]\d+[/-]\d+'))
dist = event.find('td', class_=re.compile('Events?Distance'), string=re.compile('^\s*[\d\.,]+\s*(KM)?\s*$', flags=re.IGNORECASE))
if link is not None and name is not None:
if not link['href'] in uniqurl:
uniqurl.append(link['href'])
raceinfo['url'] = link['href']
raceinfo['event'] = name.string
raceinfo['date'] = dtp.parse(date.string, dayfirst=True)
raceinfo['distance'] = None
if dist is not None:
raceinfo['distance'] = dist.string
spreadsheets.append(raceinfo)
for race in spreadsheets:
url = race['url']
event = race['event']
      ''' If URLs or event names were given on the command line, only load matching spreadsheets '''
isthisevent = False
for checkurl in requrls:
if checkurl and re.search(checkurl, event, flags=re.IGNORECASE):
isthisevent = True
if requrls[0] and url not in requrls and not isthisevent:
continue
if file_in_db(url):
continue
with urllib.request.urlopen(url) as response, tempfile.TemporaryDirectory() as tmpdir:
data = response.read()
urlparts = urllib.parse.urlparse(url)
filename = os.path.basename(urlparts.path)
if re.search('WALKRESULTS', filename, flags=re.IGNORECASE):
continue
log.info("Loading data from URL {}".format(url))
filepath = os.path.join(tmpdir, filename)
with open(filepath, 'wb') as fp:
fp.write(data)
try:
rows = read_spreadsheet(filepath, src=url, eventname=race['event'], eventdate=race['date'], eventdistance=race['distance'])
except:
log.warning("ERROR: Unable to load data from URL {}".format(url))
raise
else:
load_into_db(rows, event)
log.debug("\n")
elif args.input_file:
rows = read_spreadsheet(args.input_file, src=args.input_file)
log.info("Loading data from file {}".format(args.input_file))
load_into_db(rows)
else:
for message in mailbox.Maildir(MAILDIR):
counter = 1
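      # Walk the message's MIME parts looking for Excel attachments (.xls/.xlsx variants);
      # unnamed parts are given a generated part-NNN filename so they can still be tried.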
for part in message.walk():
if part.get_content_maintype() == 'multipart':
continue
filename = part.get_filename()
ext = mimetypes.guess_extension(part.get_content_type())
if not filename:
if not ext:
ext = '.xls' # attempt to decode as a spreadsheet
filename = 'part-{:03}{}'.format(counter, ext)
counter += 1
        if re.search(r'\.xl(b|s)x?$', filename, flags=re.IGNORECASE) is not None:
with tempfile.TemporaryDirectory() as tmpdir:
filepath = os.path.join(tmpdir, filename)
with open(filepath, 'wb') as fp:
fp.write(part.get_payload(decode=True))
log.info("Loading data from file {}".format(filename))
            try:
              rows = read_spreadsheet(filepath, src=message['from'])
            except Exception:
              log.info("Unable to load data from file {}".format(filename))
            else:
              load_into_db(rows)
return
def position_calculations(events=None):
db = MySQLdb.connect(user='aac', passwd='saOAcCWHg4LaoSSA', db='AAC',
use_unicode=True, charset="utf8", cursorclass=MySQLdb.cursors.DictCursor)
c = db.cursor()
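  # Build an optional WHERE clause from the requested event names, then fill in the
  # aggregate finisher counts (overall, per sex and per category) for each distinct
  # (event, date, distance) combination wherever they are still NULL.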
where = ''
wheres = []
if isinstance(events, list):
for event in events:
if event:
wheres.append('event LIKE "%{}%"'.format(event))
elif isinstance(events, str):
wheres.append('event LIKE "%{}%"'.format(events))
if wheres:
where = 'WHERE ' + ' OR '.join(wheres)
sql = 'SELECT event, date, distance FROM `results` {} GROUP BY event, date, distance'.format(where)
c.execute(sql)
#log.debug(c._last_executed)
eventlist = [e for e in c.fetchall()]
for race in eventlist:
log.debug("Recalculating position information for {}".format(race['event']))
''' Calculate total finishers per race '''
sql = 'UPDATE `results` AS r, (SELECT event, date, distance, COUNT(distance) AS finishers FROM `results` WHERE event LIKE "{}" AND date = "{}" AND distance LIKE "{}" GROUP BY event, date, distance) AS f SET r.finishers = f.finishers WHERE r.event = f.event AND r.date = f.date AND r.distance = f.distance AND r.finishers IS NULL;'.format(race['event'], race['date'], race['distance'])
c.execute(sql)
result = c.fetchall()
''' Update total finishers per sex per race '''
sql = 'UPDATE `results` AS r, (SELECT event, date, distance, sex, COUNT(sex) as sexfinishers FROM `results` WHERE event LIKE "{}" AND date = "{}" AND distance LIKE "{}" GROUP BY sex) AS s SET r.sexfinishers = s.sexfinishers WHERE r.event = s.event AND r.date = s.date AND r.distance = s.distance AND r.sexfinishers IS NULL AND r.sex = s.sex;'.format(race['event'], race['date'], race['distance'])
#print(sql)
c.execute(sql)
result = c.fetchall()
# ''' Update individual positions per sex per race '''
# sql = 'SELECT DISTINCT sex from `results` WHERE event LIKE "{}" AND date = "{}" AND distance LIKE "{}" AND sex IS NOT NULL;'.format(race['event'], race['date'], race['distance'])
# c.execute(sql)
# result = c.fetchall()
# for row in result:
# sql = 'UPDATE `results` AS r, (SELECT result_key, position, @rank := @rank+1 AS rank FROM `results` WHERE event LIKE "{}" AND date = "{}" AND distance LIKE "{}" AND sex LIKE "{}" ORDER BY position) AS c, (SELECT @rank := 0) AS n SET r.sexposition = c.rank WHERE r.result_key = c.result_key;'.format(race['event'], race['date'], race['distance'], row['sex'])
# print(sql)
# c.execute(sql)
# result = c.fetchall()
# # ''' This seems to generate a universal ranking on 1 the first time the statement is run '''
# #c.execute('SET @rank = 0;')
# #sql = 'UPDATE `results` AS r, (SELECT result_key, position, @rank := IF(@sex = sex, @rank+1, 1) AS srank, @sex := sex FROM `results` WHERE event LIKE "{}" AND date = "{}" AND distance LIKE "{}" AND sex IS NOT NULL ORDER BY sex, position) AS s SET r.sexposition = s.srank WHERE r.result_key = s.result_key;'.format(race['event'], race['date'], race['distance'])
# #print(sql)
# #c.execute(sql)
# #result = c.fetchall()
''' Update total finishers per category per race '''
sql = 'UPDATE `results` AS r, (SELECT event, date, distance, category, COUNT(category) as catfinishers FROM `results` WHERE event LIKE "{}" AND date = "{}" AND distance LIKE "{}" GROUP BY category) AS c SET r.catfinishers = c.catfinishers WHERE r.event = c.event AND r.date = c.date AND r.distance = c.distance AND r.catfinishers IS NULL AND r.category = c.category;'.format(race['event'], race['date'], race['distance'])
#print(sql)
c.execute(sql)
result = c.fetchall()
# ''' Update individual positions per category per race '''
# sql = 'SELECT DISTINCT category from `results` WHERE event LIKE "{}" AND date = "{}" AND distance LIKE "{}" AND category IS NOT NULL;'.format(race['event'], race['date'], race['distance'])
# c.execute(sql)
# result = c.fetchall()
# for row in result:
# sql = 'UPDATE `results` AS r, (SELECT result_key, position, @rank := @rank+1 AS rank FROM `results` WHERE event LIKE "{}" AND date = "{}" AND distance LIKE "{}" AND category LIKE "{}" ORDER BY position) AS c, (SELECT @rank := 0) AS n SET r.catposition = c.rank WHERE r.result_key = c.result_key;'.format(race['event'], race['date'], race['distance'], row['category'])
# print(sql)
# c.execute(sql)
# result = c.fetchall()
# # ''' This seems to generate a universal ranking on 1 the first time the statement is run '''
# #c.execute('SET @rank = 0;')
# #sql = 'UPDATE `results` AS r, (SELECT result_key, position, @rank := IF(@cat = category, @rank+1, 1) AS crank, @cat := category FROM `results` WHERE event LIKE "{}" AND date = "{}" AND distance LIKE "{}" AND category IS NOT NULL ORDER BY category, position) AS c SET r.catposition = c.crank WHERE r.result_key = c.result_key;'.format(race['event'], race['date'], race['distance'])
# #print(sql)
# #c.execute(sql)
# #result = c.fetchall()
db.commit()
return
def file_in_db(url):
db = MySQLdb.connect(user='aac', passwd='saOAcCWHg4LaoSSA', db='AAC',
use_unicode=True, charset="utf8")
c = db.cursor()
  ''' Check whether results from this source URL have already been loaded '''
sql = 'SELECT COUNT(*) FROM `results` WHERE source LIKE %s'
c.execute(sql, (url,))
#log.debug(c._last_executed)
if (c.fetchone()[0] > 0):
return True
return False
def load_into_db(rows, event=None):
'''
CREATE TABLE `results` (
`result_key` int(11) NOT NULL AUTO_INCREMENT,
`date` datetime DEFAULT NULL,
`distance` decimal(4,2) DEFAULT NULL,
`event` varchar(100) COLLATE utf8_unicode_ci NOT NULL,
`position` int(5) NOT NULL,
`finishers` int(5) DEFAULT NULL,
`time` time NOT NULL,
`name` varchar(75) COLLATE utf8_unicode_ci DEFAULT NULL,
`surname` varchar(75) COLLATE utf8_unicode_ci DEFAULT NULL,
`licence` varchar(20) COLLATE utf8_unicode_ci DEFAULT NULL,
`club` varchar(80) COLLATE utf8_unicode_ci DEFAULT NULL,
`age` int(3) DEFAULT NULL,
`sex` varchar(10) COLLATE utf8_unicode_ci DEFAULT NULL,
`sexposition` int(5) NOT NULL,
`sexfinishers` int(5) DEFAULT NULL,
`category` varchar(15) COLLATE utf8_unicode_ci DEFAULT NULL,
`catposition` int(5) NOT NULL,
`catfinishers` int(5) DEFAULT NULL,
`source` varchar(200) COLLATE utf8_unicode_ci DEFAULT NULL,
PRIMARY KEY (`result_key`)
) ENGINE=InnoDB CHARSET=utf8 COLLATE=utf8_unicode_ci;
'''
if rows is None or len(rows) < 1:
log.warning("**** No data found in spreadsheet {} ****".format(event))
else:
db = MySQLdb.connect(user='aac', passwd='saOAcCWHg4LaoSSA', db='AAC',
use_unicode=True, charset="utf8")
c = db.cursor()
    ''' Skip this spreadsheet if results from this source have already been loaded '''
sql = 'SELECT COUNT(*) FROM `results` WHERE source LIKE %s'
c.execute(sql, (rows[0].get('source'),))
#log.debug(c._last_executed)
if (c.fetchone()[0] > 0):
log.info("Spreadsheet data already loaded")
return
    ''' Check for duplicate values by DATE and POSITION and DISTANCE and EVENT '''
    sql = 'SELECT COUNT(*) FROM `results` WHERE date=%s AND position=%s AND distance LIKE %s AND event LIKE %s'
c.execute(sql, (rows[0].get('date'), rows[0].get('position'), rows[0].get('distance'), rows[0].get('event'),))
#log.debug(c._last_executed)
if (c.fetchone()[0] > 0):
log.info("Spreadsheet data already loaded")
return
for r in rows:
fields = ', '.join(r.keys())
values = ', '.join(['%s'] * len(r)) # placeholder values
sql = 'INSERT into `results` ( {} ) VALUES ( {} )'.format(fields, values)
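      # The generated statement looks like:
      #   INSERT into `results` ( position, time, name, ... ) VALUES ( %s, %s, %s, ... )
      # with the column names taken directly from the spreadsheet row's keys and the
      # values supplied separately to execute().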
try:
        c.execute(sql, list(r.values()))
      except Exception as e:
        log.debug("ERROR: {}".format(e))
        log.debug("Last query was: {}".format(c._last_executed))
        raise
#pass
db.commit()
#position_calculations(event)
return
def read_spreadsheet(spreadsheet, src=None, eventname=None, eventdate=None, eventdistance=None):
rows = []
filename = os.path.basename(spreadsheet)
  if re.search(r'\.xlsx?$', spreadsheet, flags=re.IGNORECASE) is not None:
book = xlrd.open_workbook(spreadsheet)
for sheetname in book.sheet_names():
if re.search('walk', sheetname, flags=re.IGNORECASE) is not None:
continue
sheet = book.sheet_by_name(sheetname)
log.debug("Processing sheet {}".format(sheetname))
''' Look for the header in the first 15 rows, searching from the top '''
fields = []
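      # A row is treated as the header when it contains two or more recognisable column
      # names in a row, e.g. "Pos No Name Surname Time Club".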
for row in range(0, 15):
try:
if re.search('((pos\w*|no\w*|num\w*|(last\s*|sur)name|(first\s*)?name|time|club)\s*){2,}', ' '.join(str(x) for x in (sheet.row_values(row))), flags=re.IGNORECASE) is not None:
fields = sheet.row_values(row)
log.debug("Spreadsheet fields: {}".format(', '.join(str(x) for x in fields)))
break
except:
''' Probably a blank sheet, let's skip '''
continue
      ''' Translate recognised field names into the database column names '''
position_idx = None
time_idx = {}
for i in range(len(fields)):
if 'position' not in fields and re.search('^\s*(overall)?\s*(pos|place|index)', str(fields[i]), flags=re.IGNORECASE) is not None:
fields[i] = 'position'
''' Store the index of this field for later processing '''
position_idx = i
elif 'time' not in fields and re.search('^\s*(race\s*)?(finish|elapsed_?|f\S?|net|chip)?\s*(time|h:?m:?s?)', str(fields[i]), flags=re.IGNORECASE) is not None:
fields[i] = 'time'
''' Store the index of this field for later processing '''
time_idx[fields[i]] = i
elif re.search('^\s*start\s*time', str(fields[i]), flags=re.IGNORECASE) is not None:
fields[i] = 'starttime'
time_idx[fields[i]] = i
elif re.search('^\s*(age\s*)?cat\S*\s*pos(\.|\w+)?\s*$', str(fields[i]), flags=re.IGNORECASE) is not None:
fields[i] = 'catposition'
elif re.search('^\s*(sex|gender)\s*pos(\.|\w*)\s*$', str(fields[i]), flags=re.IGNORECASE) is not None:
fields[i] = 'sexposition'
elif re.search('^\s*pos(\.|\w+)\s*(sex|gender)\s*$', str(fields[i]), flags=re.IGNORECASE) is not None:
fields[i] = 'sexposition'
elif re.search('^\s*(sur|last)\s*name', str(fields[i]), flags=re.IGNORECASE) is not None:
fields[i] = 'surname'
elif re.search('^\s*((first|nick)?\s*name|participant)', str(fields[i]), flags=re.IGNORECASE) is not None:
fields[i] = 'name'
elif re.search('^\s*(club(\.|\w*)|team)\s*(name)?\s*$', str(fields[i]), flags=re.IGNORECASE) is not None:
fields[i] = 'club'
elif re.search('^\s*age(\.|\w*)\s*$', str(fields[i]), flags=re.IGNORECASE) is not None:
fields[i] = 'age'
elif re.search('^\s*(sex|gender|m.?f|male|female)(\.|\w*)\s*$', str(fields[i]), flags=re.IGNORECASE) is not None:
fields[i] = 'sex'
elif re.search('^\s*((age\s*)?cat|extra group)(\.|\w*)\s*$', str(fields[i]), flags=re.IGNORECASE) is not None:
fields[i] = 'category'
elif re.search('^\s*(race)?\s*(lic|no|num)(\.|\S*)\s*\S*\s*$', str(fields[i]), flags=re.IGNORECASE) is not None:
fields[i] = 'licence'
elif re.search('^\s*(race)?date', str(fields[i]), flags=re.IGNORECASE) is not None:
fields[i] = 'date'
elif re.search('^\s*(race)?dist(ance)?\s*$', str(fields[i]), flags=re.IGNORECASE) is not None:
fields[i] = 'distance'
elif re.search('^\s*(race)?(event|name)\s*$', str(fields[i]), flags=re.IGNORECASE) is not None:
fields[i] = 'event'
''' If there isn't a position field or a time field, we don't want this sheet '''
      if position_idx is None or 'time' not in time_idx:
continue
      ''' Look for the date in the file name, then check the first 15 rows of the sheet for a date, which takes precedence '''
if eventdate is None:
eventdate = dt.datetime.min
filedate = re.search('(20\d{2})', str(filename), flags=re.IGNORECASE)
if filedate is not None:
eventdate = filedate.group(1)
for row in range(0, 15):
sheetdate = re.search('(\d+\s*(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*\s+\d{4})', str(sheet.cell(row, 0).value), flags=re.IGNORECASE)
if sheetdate is not None:
eventdate = sheetdate.group(1)
break
log.info("Race date: {}".format(eventdate))
#log.info("Race date: {}".format(eventdate.strftime('%Y-%m-%d')))
if eventname is None:
''' Use the filename as the event name :-( '''
eventname, *_ = os.path.splitext(filename)
eventname = str(eventname)
''' Clean up common patterns '''
eventname = re.sub('[\-_]', ' ', eventname, flags=re.IGNORECASE)
eventname = re.sub('results?(\s*book)?', ' ', eventname, flags=re.IGNORECASE)
eventname = re.sub('export', ' ', eventname, flags=re.IGNORECASE)
eventname = re.sub('excel', ' ', eventname, flags=re.IGNORECASE)
eventname = re.sub('\(\d\)', ' ', eventname, flags=re.IGNORECASE)
eventname = re.sub('\d{0,4}20\d{2}\d{0,4}', ' ', eventname, flags=re.IGNORECASE)
eventname = re.sub('\s\s+', ' ', eventname, flags=re.IGNORECASE)
eventname = re.sub('(^\s*|\s*$)', '', eventname, flags=re.IGNORECASE)
log.info("Event name: {}".format(eventname))
      ''' Look for the race distance in the event name, the file name or the sheet name (the sheet name takes precedence) '''
distance = eventdistance
log.info("Race distance: {}".format(distance))
eventnamedistance = re.search('(Half)?[\s-]*(Marathon)', eventname, flags=re.IGNORECASE)
if eventnamedistance is not None:
if eventnamedistance.group(1) is not None:
distance = 21.1
else:
distance = 42.2
filedistance = re.search('(\d{1,2}([\.,]\d)?)\s*KM', filename, flags=re.IGNORECASE)
if not distance and filedistance is not None:
distance = filedistance.group(1)
eventnamedistance = re.search('([\d\.,]{2,3})\s*KM', eventname, flags=re.IGNORECASE)
if eventnamedistance is not None:
distance = eventnamedistance.group(1)
sheetdistance = re.search('([\d\.,]+\s*KM)', sheetname, flags=re.IGNORECASE)
if sheetdistance is not None:
distance = sheetdistance.group(1)
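      ''' Helper/marshal sheets get the sheet name itself recorded in place of a distance '''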
sheetdistance = re.search('(helper|marshal)', sheetname, flags=re.IGNORECASE)
if sheetdistance is not None:
distance = sheetname
log.info("Race distance: {}".format(distance))
for row in range(sheet.nrows):
        ''' Only process rows where the position cell contains a number '''
        if re.search('(^\s*$|[A-Za-z])', str(sheet.cell(row, position_idx).value), flags=re.IGNORECASE) is None:
item = dict()
data = []
for col in range(sheet.ncols):
data.append(sheet.cell(row, col).value)
item = dict(zip(fields, data))
''' If the time has been modified by Excel, unmodify it '''
timecols = ( 'time', 'starttime' )
for timecol in timecols:
if timecol in item and isinstance(item[timecol], float):
try:
item[timecol] = xlrd.xldate_as_tuple(sheet.cell(row, time_idx[timecol]).value, book.datemode)
except:
try:
if book.datemode == 1:
flipdatemode = 0
else:
flipdatemode = 1
item[timecol] = xlrd.xldate_as_tuple(sheet.cell(row, time_idx[timecol]).value, flipdatemode)
except:
continue
''' Skip this row if the date can't be parsed, as it's probably wrong anyway (41 hours or something silly) '''
continue
item['source'] = src
item['date'] = eventdate
item['event'] = eventname
          if 'distance' not in item:
item['distance'] = distance
rows.append(item)
rows = clean_data(rows)
if len(rows) > 0:
log.debug("Sample output: {}".format(pp.pformat(rows[0])))
return rows
def clean_data(input_rows):
rows = []
for ir in input_rows:
r = dict()
''' Fix date '''
date = ir.get('date')
if isinstance(date, str):
today = dt.datetime.now()
year = dt.datetime.combine(dt.date(year=today.year, month=1, day=1), dt.time(hour=0, minute=0, second=0))
date = dtp.parse(date, default=year)
r['date'] = date
''' Check time '''
time = ir['time']
#print("1: {} {}".format(time, type(time)))
''' Deal with various formats that xlrd might give us. Note that floats should already be converted to tuples '''
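    # xlrd.xldate_as_tuple() yields (year, month, day, hour, minute, second); only the
    # time-of-day fields are used here.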
if isinstance(time, tuple):
time = dt.datetime.combine(dt.date(year=1900, month=1, day=1), dt.time(hour=time[3], minute=time[4], second=time[5]))
#print("2: {} {}".format(time, type(time)))
    elif isinstance(time, float):
      for fmt in ( '%H.%M.%S', '%M.%S', ):
        try:
          time = dt.datetime.strptime(str(time), fmt)
          #print("3: {} {} {}".format(time, type(time), fmt))
        except ValueError:
          pass
elif isinstance(time, str):
for fmt in ( '%H:%M:%S', '%M:%S', '%H.%M.%S', '%M.%S', ):
try:
time = dt.datetime.strptime(time, fmt)
#print("4: {} {} {}".format(time, type(time), fmt))
except:
pass
''' If time is still a string, ignore it, as it's probably blank/DNF '''
if isinstance(time, str):
continue
if 'starttime' in ir:
starttime = ir['starttime']
#print("Start 1: {} {}".format(starttime, type(starttime)))
if isinstance(starttime, tuple):
starttime = dt.datetime.combine(dt.date(year=1900, month=1, day=1), dt.time(hour=starttime[3], minute=starttime[4], second=starttime[5]))
#print("Start 2: {} {}".format(starttime, type(starttime)))
      if isinstance(starttime, float):
        for fmt in ( '%H.%M.%S', '%M.%S', ):
          try:
            starttime = dt.datetime.strptime(str(starttime), fmt)
            #print("Start 3: {} {} {}".format(starttime, type(starttime), fmt))
          except ValueError:
            pass
elif isinstance(starttime, str):
for fmt in ( '%H:%M:%S', '%M:%S', '%H.%M.%S', '%M.%S', ):
try:
starttime = dt.datetime.strptime(starttime, fmt)
#print("Start 4: {} {} {}".format(starttime, type(starttime), fmt))
except:
pass
      ''' If starttime could not be parsed into a datetime, leave the finish time unchanged '''
      if isinstance(time, dt.datetime) and isinstance(starttime, dt.datetime):
        timedelta = time - starttime
        time = dt.datetime.min + timedelta
#print("5: {} {}".format(time, type(time)))
r['time'] = time.time()
''' Fix distance '''
length = re.search('([\d\.,]+)(?:\s*km)?', str(ir.get('distance')), flags=re.IGNORECASE)
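    # e.g. "21.1 KM", "10,5" and the float 10.0 all normalise to a numeric distance in km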
if length is not None:
distance = re.sub(",", ".", length.group(1))
r['distance'] = float(distance)
else:
r['distance'] = 0
''' Should be an int '''
for key in ( 'position', 'sexposition', 'catposition', 'age', ):
val = ir.get(key)
if val is not None:
try:
r[key] = int(val)
except:
pass
''' Should be a string '''
for key in ( 'event', 'name', 'surname', 'licence', 'club', 'category', ):
val = ir.get(key)
if isinstance(val, float):
val = int(val)
if val is not None:
try:
r[key] = re.sub('(^\s*|\s*$)', '', str(val))
except:
pass
''' Leave alone '''
for key in ( 'event', 'source', ):
r[key] = ir.get(key)
''' Fix sex '''
    if 'sex' in r:
      sex = re.search('^\s*(F|M)', str(r.get('sex')), flags=re.IGNORECASE)
      if sex is not None:
        if sex.group(1).upper() == 'F':
          r['sex'] = 'F'
        else:
          r['sex'] = 'M'
    elif 'category' in r:
      sex = re.search('^\s*(F|M)', str(r.get('category')), flags=re.IGNORECASE)
      if sex is not None:
        if sex.group(1).upper() == 'F':
          r['sex'] = 'F'
        else:
          r['sex'] = 'M'
''' Fix club '''
if re.search('^\s*(AAC$|Atlantic\s*Athletic)', str(r.get('club')), flags=re.IGNORECASE) is not None:
r['club'] = 'AAC'
rows.append(r)
''' sort rows by position, then populate sexposition and catposition for each sex and category '''
  rows.sort(key=lambda r: r.get('position', 0))
totals = defaultdict(int)
for r in rows:
totals['positions'] += 1
if 'sex' in r:
totals[r['sex']] += 1
#log.debug("{} {}".format(totals[r['sex']], r['sex']))
if 'sexposition' not in r:
r['sexposition'] = totals[r['sex']]
if 'category' in r:
totals[r['category']] += 1
#log.debug("{} {}".format(totals[r['category']], r['category']))
if 'catposition' not in r:
r['catposition'] = totals[r['category']]
  for r in rows:
    r['finishers'] = totals['positions']
    if 'sex' in r:
      r['sexfinishers'] = totals[r['sex']]
    if 'category' in r:
      r['catfinishers'] = totals[r['category']]
return rows
def parse_arguments():
parser = argparse.ArgumentParser(description='Load a spreadsheet containing WPA results into a database')
parser.add_argument(
'--url', '-u', action='append', required=False, dest='scrapeurl', nargs="?",
help='Scrape WPA website, or, if a link is specified, the spreadsheet at that link')
parser.add_argument(
'--calc', '-c', action='append', required=False, dest='calculate', nargs="?",
help='Calculate unset positions in the database, optionally just for specified races')
parser.add_argument(
'--input', '-i', action='store', required=False, type=str, dest='input_file',
help='Manually select the spreadsheet to be imported')
parser.add_argument(
'--verbose', '-v', action='count', required=False, dest='verbose',
help='Print more information')
args = parser.parse_args()
if args.input_file:
if not os.path.exists(args.input_file) or not os.access(args.input_file, os.R_OK):
      parser.error("Cannot read input file {}".format(args.input_file))
logging.basicConfig(format='%(message)s')
if args.verbose is not None and args.verbose == 1:
log.setLevel(logging.INFO)
elif args.verbose is not None and args.verbose >= 2:
log.setLevel(logging.DEBUG)
else:
log.setLevel(logging.WARNING)
return args
if __name__ == "__main__":
main()
# vim: set expandtab shiftwidth=2 softtabstop=2 tw=0 :