2018-08-14 08:54:02 +00:00
#!/usr/bin/python3
# -*- coding: utf-8 -*-
''' Utility to load WPA results sheets into a MySQL database. '''
__author__ = ' Timothy Allen '
__email__ = ' tim@allen.org.za '
__license__ = ' MIT '
import bs4
import urllib . request
import urllib . parse
import json
import mailbox
import email
import mimetypes
import csv
import xlrd
import MySQLdb
2018-08-14 09:05:57 +00:00
import MySQLdb . cursors
2018-08-14 08:54:02 +00:00
import argparse
import datetime as dt
import dateutil . parser as dtp
import logging
import os
import re
import sys
import tempfile
2018-08-14 09:08:19 +00:00
from collections import defaultdict
2018-08-14 08:54:02 +00:00
import pprint
# Set up MySQL database, if not done
# Read Excel/ODS/CSV database into MySQL
# Check MIME attachments in email for spreadsheet, and load that ***
# Then display the data in a (separate) web application
# The user is in /etc/dovecot/users
# Password is zi6ohYae0OeYie8eegei (not that you'll ever need it)
# Maildir that main() scans for e-mailed spreadsheet attachments when no
# other input source is selected on the command line.
MAILDIR = ' /var/mail/virtual/aac/Maildir '
# Module-wide logger; parse_arguments() sets its level from the -v flags.
log = logging . getLogger ( __file__ )
# Pretty-printer used by read_spreadsheet() to dump a sample row at debug level.
pp = pprint . PrettyPrinter ( indent = 4 )
def main():
    """Pick an input source from the command line and load it.

    Exactly one mode runs, in this order of precedence:

    * ``--calc``: recompute finisher totals in the database.
    * ``--url``: scrape the WPA events listing and load every linked
      spreadsheet that is not already in the database, optionally limited
      to the URLs / event-name patterns given on the command line.
    * ``--input``: load a single local spreadsheet.
    * otherwise: scan ``MAILDIR`` for e-mailed spreadsheet attachments.
    """
    if sys.version_info < (3, 2):
        raise Exception(
            'Unsupported Python version, please use at least Python 3.2')

    args = parse_arguments()

    if args.calculate:
        position_calculations(args.calculate)
    elif args.scrapeurl:
        _load_from_wpa(args.scrapeurl)
    elif args.input_file:
        rows = read_spreadsheet(args.input_file, src=args.input_file)
        log.info("Loading data from file {}".format(args.input_file))
        load_into_db(rows)
    else:
        _load_from_maildir()
    return


def _scrape_wpa_listing():
    """Return one dict (url, event, date, distance) per unique results link."""
    spreadsheets = []
    seen_urls = set()
    wpa = 'http://www.wpa.org.za/Events/DynamicEvents.asmx/BuildEventDisplay'
    for year in range(2016, dt.datetime.now().year + 1):
        log.debug("Finding results for {}".format(year))
        query = {
            "WPAExtra": "True", "TimeColumn": "True", "entityid": "674417",
            "selectedyear": year, "selectedmonth": 0, "commissionid": "0",
            "selectedstate": "0", "categoryid": 0, "themeid": "46",
        }
        payload = json.dumps(query).encode('utf8')
        req = urllib.request.Request(
            wpa, data=payload, headers={'content-type': 'application/json'})
        with urllib.request.urlopen(req) as response:
            data = json.loads(response.read().decode('utf8'))
        page, *_ = data.values()  # get the first value
        soup = bs4.BeautifulSoup(page, 'html.parser')
        for event in soup.find_all('tr'):
            link = event.find('a', href=re.compile(r'\.xlsx?$'))
            name = event.find('td', class_=re.compile('EventHeadline'))
            if link is None or name is None:
                continue
            url = link['href']
            if url in seen_urls:
                continue
            seen_urls.add(url)

            date = event.find(
                'td', class_=re.compile('EventDate'),
                string=re.compile(r'^\s*\d+[/-]\d+[/-]\d+'))
            dist = event.find(
                'td', class_=re.compile('Events?Distance'),
                string=re.compile(r'^\s*[\d\.,]+\s*(KM)?\s*$', flags=re.IGNORECASE))

            # A row without a parsable date must not abort the whole scrape;
            # read_spreadsheet() falls back to the file name / sheet contents
            # when it is given eventdate=None.
            eventdate = None
            if date is not None:
                try:
                    eventdate = dtp.parse(date.string, dayfirst=True)
                except (ValueError, OverflowError):
                    log.warning("Unable to parse event date {!r} for {}".format(date.string, url))

            # .string is None when the cell contains nested markup.
            title = name.string if name.string is not None else name.get_text()

            spreadsheets.append({
                'url': url,
                'event': str(title),
                'date': eventdate,
                'distance': dist.string if dist is not None else None,
            })
    return spreadsheets


def _load_from_wpa(requrls):
    """Download and load the WPA spreadsheets selected by *requrls*.

    *requrls* is the list collected by ``--url``; each non-empty entry is
    either an exact spreadsheet URL or a regular expression matched
    (case-insensitively) against the event name.  With no non-empty entry
    every spreadsheet found is loaded.
    """
    filters = [u for u in requrls if u]
    for race in _scrape_wpa_listing():
        url = race['url']
        event = race['event']

        if filters and url not in filters:
            wanted = False
            for pattern in filters:
                try:
                    if re.search(pattern, event, flags=re.IGNORECASE):
                        wanted = True
                        break
                except re.error:
                    # A literal URL is not necessarily a valid regex.
                    continue
            if not wanted:
                continue

        if file_in_db(url):
            continue

        # Decide on the file name before downloading, so walk results are
        # skipped without fetching them.
        filename = os.path.basename(urllib.parse.urlparse(url).path)
        if re.search('WALKRESULTS', filename, flags=re.IGNORECASE):
            continue

        log.info("Loading data from URL {}".format(url))
        with urllib.request.urlopen(url) as response, tempfile.TemporaryDirectory() as tmpdir:
            filepath = os.path.join(tmpdir, filename)
            with open(filepath, 'wb') as fp:
                fp.write(response.read())
            try:
                rows = read_spreadsheet(
                    filepath, src=url, eventname=race['event'],
                    eventdate=race['date'], eventdistance=race['distance'])
            except Exception:
                log.warning("ERROR: Unable to load data from URL {}".format(url))
                raise
            else:
                load_into_db(rows, event)
        log.debug("\n")


def _load_from_maildir():
    """Load every spreadsheet attachment found in the ``MAILDIR`` mailbox."""
    for message in mailbox.Maildir(MAILDIR):
        counter = 1
        for part in message.walk():
            if part.get_content_maintype() == 'multipart':
                continue
            filename = part.get_filename()
            if not filename:
                ext = mimetypes.guess_extension(part.get_content_type())
                if not ext:
                    ext = '.xls'  # attempt to decode as a spreadsheet
                filename = 'part-{:03}{}'.format(counter, ext)
            counter += 1

            # The attachment name is attacker-controlled; never let it carry
            # directory components into the path we write to.
            filename = os.path.basename(filename)
            if re.search(r'\.xl(b|s)x?$', filename, flags=re.IGNORECASE) is None:
                continue
            payload = part.get_payload(decode=True)
            if payload is None:
                continue

            with tempfile.TemporaryDirectory() as tmpdir:
                filepath = os.path.join(tmpdir, filename)
                with open(filepath, 'wb') as fp:
                    fp.write(payload)
                log.info("Loading data from file {}".format(filename))
                try:
                    rows = read_spreadsheet(filepath, src=message['from'])
                except Exception:
                    # Best effort: one unreadable attachment must not stop
                    # the rest of the mailbox from being processed.
                    log.info("Unable to load data from file {}".format(filename))
                else:
                    load_into_db(rows)
2018-08-14 09:06:55 +00:00
def position_calculations(events=None):
    """Fill in missing finisher totals for races already in `results`.

    For every (event, date, distance) combination selected, rows whose
    ``finishers``, ``sexfinishers`` or ``catfinishers`` column is NULL get
    the respective count of rows in that race / sex / category.  Individual
    ``sexposition`` / ``catposition`` values are not recomputed here;
    clean_data() fills those in at load time.

    :param events: ``None`` to process every race, a string, or a list of
        strings; each non-empty string selects the events whose name
        contains it (SQL ``LIKE '%name%'``).
    """
    patterns = []
    if isinstance(events, list):
        patterns = ['%{}%'.format(event) for event in events if event]
    elif isinstance(events, str):
        patterns = ['%{}%'.format(events)]

    where = ''
    if patterns:
        where = 'WHERE ' + ' OR '.join(['event LIKE %s'] * len(patterns))

    # Values are passed as query parameters rather than formatted into the
    # statement, so event names containing quotes cannot break or inject SQL.
    finishers_sql = (
        'UPDATE `results` AS r, '
        '(SELECT event, date, distance, COUNT(distance) AS finishers '
        'FROM `results` '
        'WHERE event LIKE %s AND date = %s AND distance LIKE %s '
        'GROUP BY event, date, distance) AS f '
        'SET r.finishers = f.finishers '
        'WHERE r.event = f.event AND r.date = f.date '
        'AND r.distance = f.distance AND r.finishers IS NULL')
    # {col}/{total} are only ever filled from the constant pairs below.
    # The subquery groups by every selected column so it is also valid
    # under ONLY_FULL_GROUP_BY.
    group_sql = (
        'UPDATE `results` AS r, '
        '(SELECT event, date, distance, {col}, COUNT({col}) AS {total} '
        'FROM `results` '
        'WHERE event LIKE %s AND date = %s AND distance LIKE %s '
        'GROUP BY event, date, distance, {col}) AS s '
        'SET r.{total} = s.{total} '
        'WHERE r.event = s.event AND r.date = s.date '
        'AND r.distance = s.distance AND r.{total} IS NULL '
        'AND r.{col} = s.{col}')

    # NOTE(review): credentials are hard-coded here and in the other
    # database helpers; consider moving them to configuration.
    db = MySQLdb.connect(user='aac', passwd='saOAcCWHg4LaoSSA', db='AAC',
                         use_unicode=True, charset="utf8",
                         cursorclass=MySQLdb.cursors.DictCursor)
    try:
        c = db.cursor()
        sql = ('SELECT event, date, distance FROM `results` {} '
               'GROUP BY event, date, distance').format(where)
        c.execute(sql, patterns or None)
        eventlist = list(c.fetchall())

        for race in eventlist:
            log.debug("Recalculating position information for {}".format(race['event']))
            params = (race['event'], race['date'], race['distance'])
            # Total finishers per race.
            c.execute(finishers_sql, params)
            # Total finishers per sex, then per category, per race.
            for column, total in (('sex', 'sexfinishers'),
                                  ('category', 'catfinishers')):
                c.execute(group_sql.format(col=column, total=total), params)
        db.commit()
    finally:
        # Closing without commit discards a half-applied recalculation.
        db.close()
    return
2018-08-14 09:08:19 +00:00
def file_in_db(url):
    """Return True if `results` already holds rows whose source matches *url*.

    The comparison uses SQL ``LIKE``, so ``%`` and ``_`` in *url* act as
    wildcards.
    """
    db = MySQLdb.connect(user='aac', passwd='saOAcCWHg4LaoSSA', db='AAC',
                         use_unicode=True, charset="utf8")
    try:
        c = db.cursor()
        sql = 'SELECT COUNT(*) FROM `results` WHERE source LIKE %s'
        c.execute(sql, (url,))
        return c.fetchone()[0] > 0
    finally:
        # This runs once per scraped URL; do not leak a connection each time.
        db.close()
2018-08-14 08:54:02 +00:00
2018-08-14 09:08:19 +00:00
def load_into_db(rows, event=None):
    """Insert cleaned result rows into the `results` table.

    Nothing is inserted when *rows* is empty, when rows with the same
    ``source`` already exist, or when a row with the same date, position,
    distance and event as the first row already exists.  All rows are
    inserted in one transaction; a failing insert is logged at debug level
    and re-raised, and nothing is committed.

    :param rows: list of dicts as returned by clean_data(); the keys are
        used as column names.
    :param event: currently unused (kept for callers that pass it).

    Expected table layout::

        CREATE TABLE `results` (
          `result_key` int(11) NOT NULL AUTO_INCREMENT,
          `date` datetime DEFAULT NULL,
          `distance` decimal(4,2) DEFAULT NULL,
          `event` varchar(100) COLLATE utf8_unicode_ci NOT NULL,
          `position` int(5) NOT NULL,
          `finishers` int(5) DEFAULT NULL,
          `time` time NOT NULL,
          `name` varchar(75) COLLATE utf8_unicode_ci DEFAULT NULL,
          `surname` varchar(75) COLLATE utf8_unicode_ci DEFAULT NULL,
          `licence` varchar(20) COLLATE utf8_unicode_ci DEFAULT NULL,
          `club` varchar(80) COLLATE utf8_unicode_ci DEFAULT NULL,
          `age` int(3) DEFAULT NULL,
          `sex` varchar(10) COLLATE utf8_unicode_ci DEFAULT NULL,
          `sexposition` int(5) NOT NULL,
          `sexfinishers` int(5) DEFAULT NULL,
          `category` varchar(15) COLLATE utf8_unicode_ci DEFAULT NULL,
          `catposition` int(5) NOT NULL,
          `catfinishers` int(5) DEFAULT NULL,
          `source` varchar(200) COLLATE utf8_unicode_ci DEFAULT NULL,
          PRIMARY KEY (`result_key`)
        ) ENGINE=InnoDB CHARSET=utf8 COLLATE=utf8_unicode_ci;
    """
    if not rows:
        log.warning("**** No data found in spreadsheet ****")
        return

    db = MySQLdb.connect(user='aac', passwd='saOAcCWHg4LaoSSA', db='AAC',
                         use_unicode=True, charset="utf8")
    try:
        c = db.cursor()
        first = rows[0]

        # Duplicate check 1: anything already loaded from the same source.
        sql = 'SELECT COUNT(*) FROM `results` WHERE source LIKE %s'
        c.execute(sql, (first.get('source'),))
        if c.fetchone()[0] > 0:
            log.info("Spreadsheet data already loaded")
            return

        # Duplicate check 2: same DATE, POSITION, DISTANCE and EVENT as the
        # first row (the same results obtained from a different source).
        sql = ('SELECT COUNT(*) FROM `results` WHERE date=%s AND position=%s '
               'AND distance LIKE %s AND event LIKE %s')
        c.execute(sql, (first.get('date'), first.get('position'),
                        first.get('distance'), first.get('event')))
        if c.fetchone()[0] > 0:
            log.info("Spreadsheet data already loaded")
            return

        for r in rows:
            # Rows may carry different key sets, so build the statement per
            # row; fix the column order once so names and values line up.
            columns = list(r)
            fields = ', '.join('`{}`'.format(col) for col in columns)
            values = ', '.join(['%s'] * len(columns))  # placeholder values
            sql = 'INSERT into `results` ({}) VALUES ({})'.format(fields, values)
            try:
                c.execute(sql, [r[col] for col in columns])
            except Exception as e:
                log.debug("ERROR: {}".format(e))
                # _last_executed is a private attribute that not every
                # driver version provides; fall back to the template.
                log.debug("Last query was: {}".format(getattr(c, '_last_executed', sql)))
                raise
        db.commit()
    finally:
        # Closing an uncommitted connection rolls the transaction back.
        db.close()
    return
def read_spreadsheet(spreadsheet, src=None, eventname=None, eventdate=None, eventdistance=None):
    """Read every results sheet of an .xls/.xlsx workbook into cleaned rows.

    Sheets whose name contains "walk", and sheets without both a position
    and a time column, are skipped.  Files with another extension yield an
    empty list.

    :param spreadsheet: path of the workbook.
    :param src: value stored in each row's ``source`` field.
    :param eventname: event name; derived from the file name when None.
    :param eventdate: event date; when None it is taken from a four-digit
        20xx year in the file name, overridden by a "<day> <month> <year>"
        date found in the first column of the first 15 rows.
    :param eventdistance: starting value for the race distance, which may
        be overridden from the event name, file name and sheet name.
    :returns: the rows after clean_data().
    """
    rows = []
    filename = os.path.basename(spreadsheet)
    if re.search(r'\.xlsx?$', spreadsheet, flags=re.IGNORECASE) is not None:
        book = xlrd.open_workbook(spreadsheet)
        for sheetname in book.sheet_names():
            if re.search('walk', sheetname, flags=re.IGNORECASE) is not None:
                continue
            sheet = book.sheet_by_name(sheetname)
            log.debug("Processing sheet {}".format(sheetname))

            fields = _find_header(sheet)
            position_idx, time_idx = _translate_header(fields)
            # If there isn't a position field or a time field, we don't
            # want this sheet.
            if position_idx is None or 'time' not in time_idx:
                continue

            # Look for the date in the file name, then let the first 15
            # rows of the sheet override it.
            if eventdate is None:
                eventdate = dt.datetime.min
                filedate = re.search(r'(20\d{2})', str(filename), flags=re.IGNORECASE)
                if filedate is not None:
                    eventdate = filedate.group(1)
                for row in range(min(15, sheet.nrows)):
                    sheetdate = re.search(
                        r'(\d+\s*(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*\s+\d{4})',
                        str(sheet.cell(row, 0).value), flags=re.IGNORECASE)
                    if sheetdate is not None:
                        eventdate = sheetdate.group(1)
                        break
            log.info("Race date: {}".format(eventdate))

            if eventname is None:
                # Use the filename as the event name :-(
                eventname = _event_name_from_filename(filename)
            log.info("Event name: {}".format(eventname))

            distance = _guess_distance(eventdistance, eventname, filename, sheetname)
            log.info("Race distance: {}".format(distance))

            for row in range(sheet.nrows):
                # Only rows whose position cell is non-blank and free of
                # letters are result rows (this also skips the header).
                position_cell = str(sheet.cell(row, position_idx).value)
                if re.search(r'(^\s*$|[A-Za-z])', position_cell, flags=re.IGNORECASE) is not None:
                    continue
                item = dict(zip(fields, sheet.row_values(row)))

                # If the time has been modified by Excel, unmodify it.
                usable = True
                for timecol in ('time', 'starttime'):
                    if timecol not in time_idx or not isinstance(item.get(timecol), float):
                        continue
                    value = sheet.cell(row, time_idx[timecol]).value
                    try:
                        item[timecol] = xlrd.xldate_as_tuple(value, book.datemode)
                    except ValueError:
                        # Retry with the other workbook date mode.
                        flipdatemode = 0 if book.datemode == 1 else 1
                        try:
                            item[timecol] = xlrd.xldate_as_tuple(value, flipdatemode)
                        except ValueError:
                            # Skip this row if the time can't be parsed, as
                            # it's probably wrong anyway (41 hours or
                            # something silly).
                            usable = False
                            break
                if not usable:
                    continue

                item['source'] = src
                item['date'] = eventdate
                item['event'] = eventname
                if 'distance' not in item:
                    item['distance'] = distance
                rows.append(item)

    rows = clean_data(rows)
    if len(rows) > 0:
        log.debug("Sample output: {}".format(pp.pformat(rows[0])))
    return rows


# A row counts as the header when at least two known column titles occur.
_HEADER_ROW_RE = re.compile(
    r'((pos\w*|no\w*|num\w*|(last\s*|sur)name|(first\s*)?name|time|club)\s*){2,}',
    flags=re.IGNORECASE)

# (canonical name, header regex, first-match-only), tried in this order.
_HEADER_PATTERNS = (
    ('position', r'^\s*(overall)?\s*(pos|place|index)', True),
    ('time', r'^\s*(race\s*)?(finish|elapsed_?|f\S?|net|chip)?\s*(time|h:?m:?s?)', True),
    ('starttime', r'^\s*start\s*time', False),
    ('catposition', r'^\s*(age\s*)?cat\S*\s*pos(\.|\w+)?\s*$', False),
    ('sexposition', r'^\s*(sex|gender)\s*pos(\.|\w*)\s*$', False),
    ('sexposition', r'^\s*pos(\.|\w+)\s*(sex|gender)\s*$', False),
    ('surname', r'^\s*(sur|last)\s*name', False),
    ('name', r'^\s*((first|nick)?\s*name|participant)', False),
    ('club', r'^\s*(club(\.|\w*)|team)\s*(name)?\s*$', False),
    ('age', r'^\s*age(\.|\w*)\s*$', False),
    ('sex', r'^\s*(sex|gender|m.?f|male|female)(\.|\w*)\s*$', False),
    ('category', r'^\s*((age\s*)?cat|extra group)(\.|\w*)\s*$', False),
    ('licence', r'^\s*(race)?\s*(lic|no|num)(\.|\S*)\s*\S*\s*$', False),
    ('date', r'^\s*(race)?date', False),
    ('distance', r'^\s*(race)?dist(ance)?\s*$', False),
    ('event', r'^\s*(race)?(event|name)\s*$', False),
)


def _find_header(sheet):
    """Return the header row found in the first 15 rows of *sheet*, or []."""
    for row in range(min(15, sheet.nrows)):
        values = sheet.row_values(row)
        if _HEADER_ROW_RE.search(' '.join(str(x) for x in values)) is not None:
            log.debug("Spreadsheet fields: {}".format(', '.join(str(x) for x in values)))
            return values
    return []


def _translate_header(fields):
    """Rename recognised headers in place; return (position_idx, time_idx).

    ``time_idx`` maps 'time' / 'starttime' to their column index.
    """
    position_idx = None
    time_idx = {}
    assigned = set()
    for i, header in enumerate(fields):
        text = str(header)
        for name, pattern, first_only in _HEADER_PATTERNS:
            # Track what has been assigned rather than testing membership in
            # `fields`: a header that is literally "position" or "time"
            # must still be recognised as that column.
            if first_only and name in assigned:
                continue
            if re.search(pattern, text, flags=re.IGNORECASE) is None:
                continue
            fields[i] = name
            assigned.add(name)
            if name == 'position':
                position_idx = i
            elif name in ('time', 'starttime'):
                time_idx[name] = i
            break
    return position_idx, time_idx


def _event_name_from_filename(filename):
    """Derive a readable event name by stripping common noise from *filename*."""
    name, *_ = os.path.splitext(filename)
    name = re.sub(r'[\-_]', ' ', str(name))
    for noise in (r'results?(\s*book)?', 'export', 'excel', r'\(\d\)',
                  r'\d{0,4}20\d{2}\d{0,4}'):
        name = re.sub(noise, ' ', name, flags=re.IGNORECASE)
    return re.sub(r'\s\s+', ' ', name).strip()


def _guess_distance(eventdistance, eventname, filename, sheetname):
    """Work out the race distance; later, more specific hints win."""
    distance = eventdistance
    log.info("Race distance: {}".format(distance))
    marathon = re.search(r'(Half)?[\s-]*(Marathon)', eventname, flags=re.IGNORECASE)
    if marathon is not None:
        distance = 21.1 if marathon.group(1) is not None else 42.2
    in_filename = re.search(r'(\d{1,2}([\.,]\d)?)\s*KM', filename, flags=re.IGNORECASE)
    if not distance and in_filename is not None:
        distance = in_filename.group(1)
    in_eventname = re.search(r'([\d\.,]{2,3})\s*KM', eventname, flags=re.IGNORECASE)
    if in_eventname is not None:
        distance = in_eventname.group(1)
    in_sheetname = re.search(r'([\d\.,]+\s*KM)', sheetname, flags=re.IGNORECASE)
    if in_sheetname is not None:
        distance = in_sheetname.group(1)
    if re.search('(helper|marshal)', sheetname, flags=re.IGNORECASE) is not None:
        distance = sheetname
    return distance
def clean_data(input_rows):
    """Normalise raw spreadsheet rows into dicts ready for the database.

    Each output row holds ``date``, ``time`` (a ``datetime.time``),
    ``distance`` (float, or 0 when none can be parsed), ``event`` and
    ``source``, plus whichever of ``position``, ``sexposition``,
    ``catposition``, ``age`` (ints) and ``name``, ``surname``, ``licence``,
    ``club``, ``category`` (stripped strings) could be converted.  ``sex``
    is 'F' or 'M', taken from the sex column or else from the first letter
    of the category.

    Rows without a usable finishing time (blank, DNF, ...) are dropped.
    When a start time is present the result time is finish minus start.

    The returned list is sorted by position.  Within each race (same
    event, date and distance) missing ``sexposition`` / ``catposition``
    values are filled with the running rank, and ``finishers``,
    ``sexfinishers`` and ``catfinishers`` are set on every row.

    :param input_rows: iterable of dicts as built by read_spreadsheet().
    :returns: list of cleaned dicts.
    """
    rows = []
    for ir in input_rows:
        r = dict()

        # Fix date: strings are parsed, defaulting missing parts to
        # 1 January of the current year.
        date = ir.get('date')
        if isinstance(date, str):
            today = dt.datetime.now()
            year = dt.datetime(today.year, 1, 1)
            date = dtp.parse(date, default=year)
        r['date'] = date

        # Check time: anything that is not a recognisable time is
        # probably blank/DNF, so the row is ignored.
        time = _to_datetime(ir.get('time'))
        if time is None:
            continue
        if 'starttime' in ir:
            starttime = _to_datetime(ir['starttime'])
            # Only subtract a usable start time that precedes the finish;
            # a negative delta cannot be added to datetime.min.
            if starttime is not None and starttime <= time:
                time = dt.datetime.min + (time - starttime)
        r['time'] = time.time()

        # Fix distance: first number in the value, decimal comma allowed.
        r['distance'] = 0
        length = re.search(r'([\d\.,]+)(?:\s*km)?', str(ir.get('distance')), flags=re.IGNORECASE)
        if length is not None:
            try:
                r['distance'] = float(length.group(1).replace(',', '.'))
            except ValueError:
                pass

        # Should be an int.
        for key in ('position', 'sexposition', 'catposition', 'age'):
            val = ir.get(key)
            if val is not None:
                try:
                    r[key] = int(val)
                except (TypeError, ValueError, OverflowError):
                    pass

        # Should be a string; xlrd returns numeric cells (e.g. licence
        # numbers) as floats, so drop the ".0" first.
        for key in ('event', 'name', 'surname', 'licence', 'club', 'category'):
            val = ir.get(key)
            if isinstance(val, float):
                try:
                    val = int(val)
                except (ValueError, OverflowError):
                    pass
            if val is not None:
                r[key] = str(val).strip()

        # Leave alone.
        for key in ('event', 'source'):
            r[key] = ir.get(key)

        # Fix sex: prefer the sex column, fall back to the category.
        for candidate in (ir.get('sex'), r.get('category')):
            if candidate is None:
                continue
            match = re.search(r'^\s*(F|M)', str(candidate), flags=re.IGNORECASE)
            if match is not None:
                r['sex'] = match.group(1).upper()
                break

        # Fix club.
        if re.search(r'^\s*(AAC$|Atlantic\s*Athletic)', str(r.get('club')), flags=re.IGNORECASE) is not None:
            r['club'] = 'AAC'

        rows.append(r)

    # Sort rows by position (rows without one go last), then populate
    # sexposition and catposition for each sex and category.
    rows.sort(key=lambda row: row.get('position', float('inf')))

    # The input may mix several races (one per sheet), and a category can
    # be named like a sex ('M'), so count per race and per kind.
    race_totals = defaultdict(int)
    sex_totals = defaultdict(int)
    cat_totals = defaultdict(int)
    for r in rows:
        race = _race_key(r)
        race_totals[race] += 1
        if 'sex' in r:
            sex_totals[(race, r['sex'])] += 1
            if 'sexposition' not in r:
                r['sexposition'] = sex_totals[(race, r['sex'])]
        if 'category' in r:
            cat_totals[(race, r['category'])] += 1
            if 'catposition' not in r:
                r['catposition'] = cat_totals[(race, r['category'])]

    for r in rows:
        race = _race_key(r)
        r['finishers'] = race_totals[race]
        if 'sex' in r:
            r['sexfinishers'] = sex_totals[(race, r['sex'])]
        if 'category' in r:
            r['catfinishers'] = cat_totals[(race, r['category'])]
    return rows


def _race_key(row):
    """Return the (event, date, distance) triple identifying a row's race."""
    return (row.get('event'), row.get('date'), row.get('distance'))


def _to_datetime(value):
    """Convert a raw time cell to a datetime on 1900-01-01, or None.

    Accepts an xlrd date tuple (hour, minute, second at indexes 3-5), a
    string in H:M:S, M:S, H.M.S or M.S form, or a float read as M.SS.
    """
    base = dt.date(year=1900, month=1, day=1)
    if isinstance(value, dt.datetime):
        return value
    if isinstance(value, tuple):
        try:
            return dt.datetime.combine(
                base, dt.time(hour=value[3], minute=value[4], second=value[5]))
        except (IndexError, TypeError, ValueError):
            return None
    if isinstance(value, float):
        # Two decimals keep a trailing zero: 45.3 is taken as 45m30s.
        # NOTE(review): assumes float cells mean minutes.seconds - confirm.
        text = '{:.2f}'.format(value)
        formats = ('%H.%M.%S', '%M.%S')
    elif isinstance(value, str):
        text = value.strip()
        formats = ('%H:%M:%S', '%M:%S', '%H.%M.%S', '%M.%S')
    else:
        return None
    for fmt in formats:
        try:
            return dt.datetime.strptime(text, fmt)
        except ValueError:
            continue
    return None
def parse_arguments():
    """Parse the command line and configure the module logger.

    Options: ``--url/-u [URL]`` (repeatable), ``--calc/-c [EVENT]``
    (repeatable), ``--input/-i FILE`` and ``--verbose/-v`` (repeatable).
    ``-u`` and ``-c`` given without a value contribute ``None`` to their
    list.  One ``-v`` sets the log level to INFO, two or more to DEBUG,
    none to WARNING.

    :returns: the ``argparse.Namespace`` with ``scrapeurl``, ``calculate``,
        ``input_file`` and ``verbose``.
    :raises SystemExit: on invalid arguments or an unreadable input file.
    """
    parser = argparse.ArgumentParser(description='Load a spreadsheet containing WPA results into a database')
    parser.add_argument(
        '--url', '-u', action='append', required=False, dest='scrapeurl', nargs="?",
        help='Scrape WPA website, or, if a link is specified, the spreadsheet at that link')
    parser.add_argument(
        '--calc', '-c', action='append', required=False, dest='calculate', nargs="?",
        help='Calculate unset positions in the database, optionally just for specified races')
    parser.add_argument(
        '--input', '-i', action='store', required=False, type=str, dest='input_file',
        help='Manually select the spreadsheet to be imported')
    parser.add_argument(
        '--verbose', '-v', action='count', required=False, dest='verbose',
        help='Print more information')
    args = parser.parse_args()

    if args.input_file:
        if not os.path.exists(args.input_file) or not os.access(args.input_file, os.R_OK):
            # A bare `raise` here has no active exception to re-raise;
            # report the actual problem through argparse instead.
            parser.error('Unable to read input file {}'.format(args.input_file))

    logging.basicConfig(format='%(message)s')
    verbosity = args.verbose or 0  # action='count' leaves None when -v is absent
    if verbosity >= 2:
        log.setLevel(logging.DEBUG)
    elif verbosity == 1:
        log.setLevel(logging.INFO)
    else:
        log.setLevel(logging.WARNING)
    return args
# Script entry point: call main() when the module-name check below succeeds.
if __name__ == " __main__ " :
main ( )
# vim: set expandtab shiftwidth=2 softtabstop=2 tw=0 :