# AACResults/load_spreadsheet.py

#!/usr/bin/python3
# -*- coding: utf-8 -*-
''' Utility to load WPA results sheets into a MySQL database.'''
__author__  = 'Timothy Allen'
__email__   = 'tim@allen.org.za'
__license__ = 'MIT'

import bs4
import urllib.request
import urllib.parse
import json
import mailbox
import email
import mimetypes
import csv
import xlrd
import MySQLdb
import MySQLdb.cursors
import argparse
import datetime as dt
import dateutil.parser as dtp
import logging
import os
import re
import sys
import tempfile
import pprint

# Set up MySQL database, if not done
# Read Excel/ODS/CSV database into MySQL
# Check MIME attachments in email for spreadsheet, and load that ***
# Then display the data in a (separate) web application

# The user is in /etc/dovecot/users
# Password is zi6ohYae0OeYie8eegei (not that you'll ever need it)
# NOTE(review): a plaintext credential is recorded in the comment above; it
# should presumably be rotated and kept out of source control -- confirm.
# Maildir scanned for spreadsheet attachments when no other input source is
# selected on the command line.
MAILDIR = '/var/mail/virtual/aac/Maildir'

# Module-wide logger; its level is set from --verbose in parse_arguments().
log  = logging.getLogger(__file__)
# Pretty-printer used for the debug dump of a sample row in read_spreadsheet().
pp = pprint.PrettyPrinter(indent=4)

def main():
  ''' Import results from whichever source the command line selects.

    In order of precedence:
      --calc   only recalculate positions (optionally for the named events)
      --url    scrape the WPA events listing and import the linked spreadsheets
      --input  import a single local spreadsheet
      (none)   import spreadsheet attachments found in MAILDIR

    Raises Exception when run on a Python older than 3.2.
  '''
  if sys.version_info < (3, 2):
    raise Exception(
      'Unsupported Python version, please use at least Python 3.2')

  args = parse_arguments()

  if args.calculate:
    position_calculations(args.calculate)
  elif args.scrapeurl:
    _import_from_wpa(args.scrapeurl)
  elif args.input_file:
    rows = read_spreadsheet(args.input_file, src=args.input_file)
    log.info("Loading data from file {}".format(args.input_file))
    load_into_db(rows)
    position_calculations()
  else:
    _import_from_maildir()

  return


def _matches_event(patterns, event):
  ''' Return True if any regular expression in patterns matches the event name. '''
  if not event:
    return False
  for pattern in patterns:
    try:
      if re.search(pattern, event, flags=re.IGNORECASE):
        return True
    except re.error:
      # The selector was most likely a literal URL, not a valid regex
      continue
  return False


def _scrape_wpa_index():
  ''' Return one dict (url, event, date, distance) per spreadsheet linked
      from the WPA events listing, for every year from 2016 to now. '''
  wpa = 'http://www.wpa.org.za/Events/DynamicEvents.asmx/BuildEventDisplay'
  spreadsheets = []
  seen = set()
  for year in range(2016, dt.datetime.now().year + 1):
    log.debug("Finding results for {}".format(year))
    # Named "payload" so that it cannot clobber the parsed command line
    payload = {"WPAExtra":"True","TimeColumn":"True","entityid":"674417","selectedyear":year,"selectedmonth":0,"commissionid":"0","selectedstate":"0","categoryid":0,"themeid":"46"}
    body = json.dumps(payload).encode('utf8')
    req = urllib.request.Request(wpa, data=body, headers={'content-type': 'application/json'})
    with urllib.request.urlopen(req) as response:
      reply = json.loads(response.read().decode('utf8'))
    page, *_ = reply.values() # get the first value
    soup = bs4.BeautifulSoup(page, 'html.parser')
    for event in soup.find_all('tr'):
      link = event.find('a', href=re.compile(r'\.xlsx?$'))
      name = event.find('td', class_=re.compile('EventHeadline'))
      date = event.find('td', class_=re.compile('EventDate'))
      # "[\d.]+" rather than "[\d+\.]": the latter only ever matched a
      # single character, so multi-digit distances were never picked up
      dist = event.find('td', class_=re.compile('EventDist'), string=re.compile(r'^\s*[\d.]+\s*(KM)?\s*$'))
      if link is None or name is None or link['href'] in seen:
        continue
      seen.add(link['href'])

      eventdate = None
      if date is not None and date.string:
        try:
          eventdate = dtp.parse(date.string, dayfirst=True)
        except (ValueError, OverflowError):
          # read_spreadsheet() falls back to the file name / sheet contents
          log.warning("Unable to parse event date {}".format(date.string))

      spreadsheets.append({
        'url':      link['href'],
        'event':    name.string,
        'date':     eventdate,
        'distance': dist.string if dist is not None else None,
      })
  return spreadsheets


def _import_from_wpa(requrls):
  ''' Import spreadsheets scraped from the WPA website.

    requrls is the list collected by --url.  Entries are either spreadsheet
    URLs or regular expressions matched against the event name; a bare
    --url contributes None, which means "no restriction".
  '''
  wanted = [u for u in requrls if u]
  for race in _scrape_wpa_index():
    url   = race['url']
    event = race['event']
    if wanted and url not in wanted and not _matches_event(wanted, event):
      continue

    log.info("Loading data from URL {}".format(url))
    with tempfile.TemporaryDirectory() as tmpdir:
      with urllib.request.urlopen(url) as response:
        data = response.read()
      filename = os.path.basename(urllib.parse.urlparse(url).path)
      filepath = os.path.join(tmpdir, filename)
      with open(filepath, 'wb') as fp:
        fp.write(data)
      # The file is closed (and therefore flushed) before it is parsed
      try:
        rows = read_spreadsheet(filepath, src=url, eventname=event, eventdate=race['date'], eventdistance=race['distance'])
      except Exception:
        log.warning("ERROR: Unable to load data from URL {}".format(url))
        raise
    load_into_db(rows)
    position_calculations(event)
    log.debug("\n")


def _import_from_maildir():
  ''' Import every Excel attachment found in the messages under MAILDIR.

    Attachments that cannot be parsed are logged and skipped.
  '''
  for message in mailbox.Maildir(MAILDIR):
    counter = 1
    for part in message.walk():
      if part.get_content_maintype() == 'multipart':
        continue
      filename = part.get_filename()
      if not filename:
        # attempt to decode as a spreadsheet if the type is unknown
        ext = mimetypes.guess_extension(part.get_content_type()) or '.xls'
        filename = 'part-{:03}{}'.format(counter, ext)
      counter += 1
      # The name comes from an untrusted email: never let it leave tmpdir
      filename = os.path.basename(filename)
      if re.search(r'\.xl(b|s)x?$', filename, flags=re.IGNORECASE) is None:
        continue
      payload = part.get_payload(decode=True)
      if payload is None:
        continue

      with tempfile.TemporaryDirectory() as tmpdir:
        filepath = os.path.join(tmpdir, filename)
        with open(filepath, 'wb') as fp:
          fp.write(payload)
        log.info("Loading data from file {}".format(filename))
        try:
          rows = read_spreadsheet(filepath, src=message['from'])
        except Exception:
          log.info("Unable to load data from file {}".format(filename))
          log.debug("Reason:", exc_info=True)
          continue
      load_into_db(rows)
      position_calculations()

def position_calculations(events=None):
  ''' Fill in the finisher counts and per-sex / per-category positions that
      are still NULL in the `results` table.

    events may be None (every race in the table), a single string, or a list
    of strings; each string selects the races whose event name contains it.
    Empty or None entries in a list are ignored.

    All values are passed to the database as query parameters, so event names
    containing quotes can neither break nor alter the statements.
  '''
  if isinstance(events, str):
    events = [events]
  names = [e for e in events if e] if isinstance(events, list) else []

  count_finishers = (
    'UPDATE `results` AS r, (SELECT event, date, distance, COUNT(distance) AS finishers FROM `results` '
    'WHERE event LIKE %s AND date = %s AND distance LIKE %s GROUP BY event, date, distance) AS f '
    'SET r.finishers = f.finishers WHERE r.event = f.event AND r.date = f.date '
    'AND r.distance = f.distance AND r.finishers IS NULL;')
  count_sex_finishers = (
    'UPDATE `results` AS r, (SELECT event, date, distance, sex, COUNT(sex) as sexfinishers FROM `results` '
    'WHERE event LIKE %s AND date = %s AND distance LIKE %s GROUP BY sex) AS s '
    'SET r.sexfinishers = s.sexfinishers WHERE r.event = s.event AND r.date = s.date '
    'AND r.distance = s.distance AND r.sexfinishers IS NULL AND r.sex = s.sex;')
  rank_by_sex = (
    'UPDATE `results` AS r, (SELECT result_key, position, @rank := IF(@sex = sex, @rank+1, 1) AS srank, '
    '@sex := sex FROM `results` WHERE event LIKE %s AND date = %s AND distance LIKE %s '
    'AND sex IS NOT NULL ORDER BY sex, position) AS s '
    'SET r.sexposition = s.srank WHERE r.result_key = s.result_key AND r.sexposition IS NULL;')
  count_cat_finishers = (
    'UPDATE `results` AS r, (SELECT event, date, distance, category, COUNT(category) as catfinishers FROM `results` '
    'WHERE event LIKE %s AND date = %s AND distance LIKE %s GROUP BY category) AS c '
    'SET r.catfinishers = c.catfinishers WHERE r.event = c.event AND r.date = c.date '
    'AND r.distance = c.distance AND r.catfinishers IS NULL AND r.category = c.category;')
  rank_by_category = (
    'UPDATE `results` AS r, (SELECT result_key, position, @rank := IF(@cat = category, @rank+1, 1) AS crank, '
    '@cat := category FROM `results` WHERE event LIKE %s AND date = %s AND distance LIKE %s '
    'AND category IS NOT NULL ORDER BY category, position) AS c '
    'SET r.catposition = c.crank WHERE r.result_key = c.result_key AND r.catposition IS NULL;')
  reset_rank = 'SET @rank = 0;'

  db = MySQLdb.connect(user='aac', passwd='saOAcCWHg4LaoSSA', db='AAC',
                       use_unicode=True, charset="utf8", cursorclass=MySQLdb.cursors.DictCursor)
  try:
    c = db.cursor()

    where = ''
    if names:
      where = 'WHERE ' + ' OR '.join(['event LIKE %s'] * len(names))
    sql = 'SELECT event, date, distance FROM `results` {} GROUP BY event, date, distance'.format(where)
    # The driver adds the quoting; we only add the "contains" wildcards
    c.execute(sql, tuple('%{}%'.format(n) for n in names) or None)
    eventlist = list(c.fetchall())

    for race in eventlist:
      log.debug(race)
      log.debug("Recalculating postion information for {}".format(race['event']))

      # The distance is compared as text (LIKE), as it always was
      distance = race['distance']
      params = (race['event'], race['date'],
                str(distance) if distance is not None else None)

      # Total finishers per race
      c.execute(count_finishers, params)
      # Total finishers per sex per race
      c.execute(count_sex_finishers, params)
      # Individual positions per sex per race
      c.execute(reset_rank)
      c.execute(rank_by_sex, params)
      # Total finishers per category per race
      c.execute(count_cat_finishers, params)
      # Individual positions per category per race
      c.execute(reset_rank)
      c.execute(rank_by_category, params)

    db.commit()
  finally:
    # Closing without a commit discards a half-finished recalculation
    db.close()
  return


def load_into_db(rows):
  ''' Insert cleaned result rows into the `results` table.

    rows is a list of dicts mapping column names to values (as produced by
    clean_data()); None or an empty list only logs a warning.  Nothing is
    inserted when the table already holds rows with the same source, or a
    row with the same date, position, distance and event as the first row.
    All rows are committed together; any failing INSERT is logged and
    re-raised, and nothing is committed.

    NOTE(review): the source check uses LIKE, so "%" and "_" in a source act
    as wildcards, and two different spreadsheets sharing one source string
    are treated as duplicates -- confirm that this is intended.

    Expected table layout:

    CREATE TABLE `results` (
      `result_key` int(11) NOT NULL AUTO_INCREMENT,
      `date` datetime DEFAULT NULL,
      `distance` decimal(4,2) DEFAULT NULL,
      `event` varchar(100) COLLATE utf8_unicode_ci NOT NULL,
      `position` int(5) NOT NULL,
      `finishers` int(5) DEFAULT NULL,
      `time` time NOT NULL,
      `name` varchar(75) COLLATE utf8_unicode_ci DEFAULT NULL,
      `surname` varchar(75) COLLATE utf8_unicode_ci DEFAULT NULL,
      `licence` varchar(20) COLLATE utf8_unicode_ci DEFAULT NULL,
      `club` varchar(40) COLLATE utf8_unicode_ci DEFAULT NULL,
      `age` int(3) DEFAULT NULL,
      `sex` varchar(10) COLLATE utf8_unicode_ci DEFAULT NULL,
      `sexposition` int(5) NOT NULL,
      `sexfinishers` int(5) DEFAULT NULL,
      `category` varchar(15) COLLATE utf8_unicode_ci DEFAULT NULL,
      `catposition` int(5) NOT NULL,
      `catfinishers` int(5) DEFAULT NULL,
      `source` varchar(200) COLLATE utf8_unicode_ci DEFAULT NULL,
      PRIMARY KEY (`result_key`)
    ) ENGINE=InnoDB CHARSET=utf8 COLLATE=utf8_unicode_ci;
  '''
  if not rows:
    log.warning("**** No data found in spreadsheet ****")
    return

  db = MySQLdb.connect(user='aac', passwd='saOAcCWHg4LaoSSA', db='AAC',
                       use_unicode=True, charset="utf8")
  try:
    c = db.cursor()
    first = rows[0]

    # Check for duplicates: first by source, then by DATE, POSITION, DISTANCE and EVENT
    sql = 'SELECT COUNT(*) FROM `results` WHERE source LIKE %s'
    c.execute(sql, (first.get('source'),))
    log.debug(_last_query(c))
    if c.fetchone()[0] > 0:
      log.info("Spreadsheet data already loaded")
      return

    sql = 'SELECT COUNT(*) FROM `results` WHERE date=%s AND position=%s AND distance LIKE %s AND event LIKE %s'
    c.execute(sql, (first.get('date'), first.get('position'), first.get('distance'), first.get('event'),))
    log.debug(_last_query(c))
    if c.fetchone()[0] > 0:
      log.info("Spreadsheet data already loaded")
      return

    for r in rows:
      # Rows may carry different sets of columns, so build each INSERT separately
      columns = list(r)
      fields = ', '.join('`{}`'.format(str(k).replace('`', '``')) for k in columns)
      values = ', '.join(['%s'] * len(columns)) # placeholder values
      sql = 'INSERT into `results` ( {} ) VALUES ( {} )'.format(fields, values)
      try:
        c.execute(sql, tuple(r[k] for k in columns))
      except Exception as e:
        log.debug("ERROR: {}".format(e))
        log.debug("Last query was: {}".format(_last_query(c)))
        raise

    db.commit()
  finally:
    # Also runs on the early "already loaded" returns above
    db.close()

  return


def _last_query(cursor):
  ''' Return the last statement the cursor executed, or None if the driver
      does not expose it (the private attribute name differs by version). '''
  return getattr(cursor, '_last_executed', None) or getattr(cursor, '_executed', None)

def read_spreadsheet(spreadsheet, src=None, eventname=None, eventdate=None, eventdistance=None):
  ''' Read every results sheet of an Excel workbook into a list of row dicts.

    spreadsheet    path of the .xls/.xlsx file
    src            stored in each row as 'source'
    eventname      event name; derived from the file name when None
    eventdate      event date; when None it is looked for in the file name
                   (a 20xx year) and then in the first column of the first
                   15 rows of the sheet
    eventdistance  fallback distance; overridden by an "<n> KM" found in the
                   file name, event name or sheet name

    Sheets without a recognisable position and time column are skipped.  The
    collected rows are passed through clean_data() before being returned.
    Returns an empty list when the file does not have an Excel extension.
  '''
  rows = []
  filename = os.path.basename(spreadsheet)
  if re.search(r'\.xlsx?$', spreadsheet, flags=re.IGNORECASE) is None:
    return rows

  book = xlrd.open_workbook(spreadsheet)
  for sheetname in book.sheet_names():
    sheet = book.sheet_by_name(sheetname)
    log.debug("Processing sheet {}".format(sheetname))

    fields = _find_header(sheet)
    position_idx, time_idx = _translate_header(fields)

    # If there isn't a position field or a time field, we don't want this sheet
    if position_idx is None or time_idx is None:
      continue

    # Look for the date in the file name, and then look the first 15 rows and override it.
    # Once found, the date is reused for the remaining sheets of the workbook.
    if eventdate is None:
      eventdate = dt.datetime.min
      filedate = re.search(r'(20\d{2})', str(filename), flags=re.IGNORECASE)
      if filedate is not None:
        eventdate = filedate.group(1)
      # Bounded by nrows: short sheets used to raise IndexError here
      for row in range(min(15, sheet.nrows)):
        sheetdate = re.search(r'(\d+\s*(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*\s+\d{4})', str(sheet.cell(row, 0).value), flags=re.IGNORECASE)
        if sheetdate is not None:
          eventdate = sheetdate.group(1)
          break
    log.info("Race date: {}".format(eventdate))

    if eventname is None:
      # Use the filename as the event name :-(
      eventname, *_ = os.path.splitext(filename)
      eventname = str(eventname)
      # Clean up common patterns
      eventname = re.sub(r'[\-_]', ' ', eventname, flags=re.IGNORECASE)
      eventname = re.sub(r'results?(\s*book)?', ' ', eventname, flags=re.IGNORECASE)
      eventname = re.sub(r'export', ' ', eventname, flags=re.IGNORECASE)
      eventname = re.sub(r'excel', ' ', eventname, flags=re.IGNORECASE)
      eventname = re.sub(r'\(\d\)', ' ', eventname, flags=re.IGNORECASE)
      eventname = re.sub(r'\d{0,4}20\d{2}\d{0,4}', ' ', eventname, flags=re.IGNORECASE)
      eventname = re.sub(r'\s\s+', ' ', eventname, flags=re.IGNORECASE)
    eventname = re.sub(r'(^\s*|\s*$)', '', eventname, flags=re.IGNORECASE)
    log.info("Event name: {}".format(eventname))

    # Look for the race distance in the file name, the event name and the
    # sheet name; later sources override earlier ones
    distance = eventdistance
    filedistance = re.search(r'([\d\.]+)\s*KM', filename, flags=re.IGNORECASE)
    if filedistance is not None:
      distance = filedistance.group(1)
    eventnamedistance = re.search(r'([\d\.]+)\s*KM', eventname, flags=re.IGNORECASE)
    if eventnamedistance is not None:
      distance = eventnamedistance.group(1)
    sheetdistance = re.search(r'([\d\.]+\s*KM)', sheetname, flags=re.IGNORECASE)
    if sheetdistance is not None:
      distance = sheetdistance.group(1)
    if re.search(r'(helper|marshal)', sheetname, flags=re.IGNORECASE) is not None:
      distance = sheetname
    log.info("Race distance: {}".format(distance))

    for row in range(sheet.nrows):
      # A result row has a position cell that is neither blank nor contains letters
      if re.search(r'(^\s*$|[A-Za-z])', str(sheet.cell(row, position_idx).value), flags=re.IGNORECASE) is not None:
        continue
      item = dict(zip(fields, sheet.row_values(row)))
      # If the time has been modified by Excel, unmodify it
      if 'time' in item and isinstance(item['time'], float):
        try:
          item['time'] = xlrd.xldate_as_tuple(sheet.cell(row, time_idx).value, book.datemode)
        except Exception:
          # Skip this row if the date can't be parsed, as it's probably
          # wrong anyway (41 hours or something silly)
          continue
      item['date']      = eventdate
      item['event']     = eventname
      item['distance']  = distance
      item['source']    = src
      rows.append(item)

  rows = clean_data(rows)
  if len(rows) > 0:
    log.debug("Sample output: {}".format(pp.pformat(rows[0])))
  return rows


# A header row contains at least two of these words next to each other
_HEADER_ROW = re.compile(r'((pos\w*|no\w*|num\w*|surname|name|time|club)\s*){2,}', flags=re.IGNORECASE)

# Column heading patterns and the database field each one maps to.  The order
# matters: the first pattern that matches a heading wins.
_HEADER_FIELDS = tuple((re.compile(pattern, flags=re.IGNORECASE), field) for pattern, field in (
  (r'^\s*pos',                                         'position'),
  (r'^\s*(time|h:?m:?s?)',                             'time'),
  (r'^\s*cat\S*\s*pos(\.|\w+)?\s*$',                   'catposition'),
  (r'^\s*(sex|gender)\s*pos(\.|\w*)\s*$',              'sexposition'),
  (r'^\s*(sur|last\s*)name',                           'surname'),
  (r'^\s*name',                                        'name'),
  (r'^\s*club(\.|\w*)\s*$',                            'club'),
  (r'^\s*age(\.|\w*)\s*$',                             'age'),
  (r'^\s*(sex|gender|m.?f|male|female)(\.|\w*)\s*$',   'sex'),
  (r'^\s*cat(\.|\w*)\s*$',                             'category'),
  (r'^\s*(lic|no|num)(\.|\S*)\s*\S*\s*$',              'licence'),
  (r'^\s*(race)?date',                                 'date'),
  (r'^\s*(race)?dist(ance)?\s*$',                      'distance'),
  (r'^\s*(race)?(event|name)\s*$',                     'event'),
))


def _find_header(sheet):
  ''' Return the cell values of the header row, searching the first 15 rows
      from the top, or an empty list if no row looks like a header. '''
  for row in range(min(15, sheet.nrows)):
    values = sheet.row_values(row)
    if _HEADER_ROW.search(' '.join(str(x) for x in values)) is not None:
      log.debug("Spreadsheet fields: {}".format(', '.join(str(x) for x in values)))
      return values
  return []


def _translate_header(fields):
  ''' Replace recognised headings in fields (in place) with database field
      names and return the (position, time) column indexes, each of which is
      None when no such column exists.  Unrecognised headings are left as is. '''
  position_idx = None
  time_idx = None
  for i, heading in enumerate(fields):
    heading = str(heading)
    for pattern, field in _HEADER_FIELDS:
      if pattern.search(heading) is None:
        continue
      fields[i] = field
      # Store the index of these two fields for later processing
      if field == 'position':
        position_idx = i
      elif field == 'time':
        time_idx = i
      break
  return position_idx, time_idx


def clean_data(input_rows):
  ''' Normalise raw spreadsheet rows into dicts ready for the database.

    input_rows is a list of dicts as built by read_spreadsheet().  For each
    row the result contains:
      date      parsed with dateutil when it is a string, otherwise unchanged
      time      a datetime.time; rows whose time cannot be understood are dropped
      distance  a float if the raw value converts to one, else the number in
                front of "km" (as a string), else 0
      sex       'F' or 'M', only when the raw value starts with f or m
      position, sexposition, catposition, age   ints, when convertible
      name, surname, licence, club, category    stripped strings, when present
      club      'AAC' for anything starting with "AAC" or "Atlantic Athletic"
      event, source   copied unchanged
    Columns not listed above are discarded.
  '''
  rows = []
  for ir in input_rows:
    r = dict()

    # Fix date: a bare year or partial date defaults to 1 January of this year
    date = ir.get('date')
    if isinstance(date, str):
      today = dt.datetime.now()
      year = dt.datetime.combine(dt.date(year=today.year, month=1, day=1), dt.time(hour=0, minute=0, second=0))
      date = dtp.parse(date, default=year)
    r['date'] = date

    # Check time: without a usable finishing time the row is worthless
    time = _parse_time(ir.get('time'))
    if time is None:
      continue
    r['time'] = time

    # Fix distance
    r['distance'] = _parse_distance(ir.get('distance'))

    # Fix sex.  The match is case insensitive, so normalise the case before
    # storing it (lowercase "f" used to be recorded as 'M')
    if 'sex' in ir:
      sex = re.search(r'^\s*(F|M)', str(ir.get('sex')), flags=re.IGNORECASE)
      if sex is not None:
        r['sex'] = sex.group(1).upper()

    # Should be an int
    for key in ( 'position', 'sexposition', 'catposition', 'age', ):
      val = ir.get(key)
      if val is not None:
        try:
          r[key] = int(val)
        except (TypeError, ValueError, OverflowError):
          pass

    # Should be a string
    for key in ( 'event', 'name', 'surname', 'licence', 'club', 'category', ):
      val = ir.get(key)
      if val is None:
        continue
      try:
        if isinstance(val, float):
          # Excel stores whole numbers (licence numbers etc.) as floats
          val = int(val)
        r[key] = str(val).strip()
      except (TypeError, ValueError, OverflowError):
        pass

    # Fix club.  This must come after the string clean-up above, which would
    # otherwise overwrite it.  Raw string: "\b" has to reach the regex engine
    # as a word boundary, not as a backspace character
    if re.search(r'^\s*(AAC\b|Atlantic\s*Athletic)', str(ir.get('club')), flags=re.IGNORECASE) is not None:
      r['club'] = 'AAC'

    # Leave alone
    for key in ( 'event', 'source', ):
      r[key] = ir.get(key)

    rows.append(r)
  return rows


def _parse_time(value):
  ''' Convert the formats xlrd may give us into a datetime.time.

    Accepts an xldate tuple (floats should already have been converted),
    an "H:M:S" or "M:S" string, a datetime or a time.  Returns None for
    anything else.
  '''
  if isinstance(value, tuple):
    return dt.time(hour=value[3], minute=value[4], second=value[5])
  if isinstance(value, dt.datetime):
    return value.time()
  if isinstance(value, dt.time):
    return value
  if isinstance(value, str):
    for fmt in ('%H:%M:%S', '%M:%S'):
      try:
        return dt.datetime.strptime(value.strip(), fmt).time()
      except ValueError:
        continue
  return None


def _parse_distance(value):
  ''' Return the distance as a float when value converts directly, else the
      number in front of "km" (as a string), else 0. '''
  if value is None:
    return 0
  try:
    return float(value)
  except (TypeError, ValueError):
    pass
  length = re.search(r'([\d\.]+)\s*km', str(value), flags=re.IGNORECASE)
  if length is not None:
    return length.group(1)
  return 0


def parse_arguments():
  ''' Parse the command line and configure logging.

    Options:
      --url/-u [URL]     scrape the WPA website; may be repeated, and a bare
                         flag is collected as None
      --calc/-c [EVENT]  recalculate unset positions; may be repeated, and a
                         bare flag is collected as None
      --input/-i FILE    import the given spreadsheet
      --verbose/-v       once for INFO, twice or more for DEBUG output

    Returns the argparse namespace (scrapeurl, calculate, input_file,
    verbose).  Exits with a usage error when the input file is not a
    readable file.
  '''
  parser = argparse.ArgumentParser(description='Load a spreadsheet containing WPA results into a database')
  parser.add_argument(
    '--url', '-u', action='append', required=False, dest='scrapeurl', nargs="?",
    help='Scrape WPA website, or, if a link is specified, the spreadsheet at that link')
  parser.add_argument(
    '--calc', '-c', action='append', required=False, dest='calculate', nargs="?",
    help='Calculate unset positions in the database, optionally just for specified races')
  parser.add_argument(
    '--input', '-i', action='store', required=False, type=str, dest='input_file',
    help='Manually select the spreadsheet to be imported')
  parser.add_argument(
    '--verbose', '-v', action='count', required=False, dest='verbose',
    help='Print more information')
  args = parser.parse_args()

  if args.input_file:
    readable = os.path.isfile(args.input_file) and os.access(args.input_file, os.R_OK)
    if not readable:
      # A bare "raise" here had no active exception to re-raise and only
      # produced an unrelated RuntimeError; report the real problem instead
      parser.error('Unable to read input file {}'.format(args.input_file))

  logging.basicConfig(format='%(message)s')
  # action='count' leaves verbose as None when the flag is absent
  verbosity = args.verbose or 0
  if verbosity >= 2:
    log.setLevel(logging.DEBUG)
  elif verbosity == 1:
    log.setLevel(logging.INFO)
  else:
    log.setLevel(logging.WARNING)

  return args

# Run the importer only when executed as a script, not when imported as a module
if __name__ == "__main__":
    main()

# vim: set expandtab shiftwidth=2 softtabstop=2 tw=0 :