#!/usr/bin/python
# -*- coding: utf-8  -*-
# unicode header in previous line

# CPBL, 2009/2010

import re
import sys
from cpblUtilities import uniqueInOrder,doSystem,orderListByRule,dgetget,flattenList
from pylab import *

##############################################################################
##############################################################################
#
def wgetUBC(url):  # Note: Since I'm not logging in with cookies like for parkers, this is generic
    #
    ##########################################################################
    ##########################################################################
    """Grab a page and extract certain infos"""
    import urllib
    import urllib2
    import re  # Regular expressions
    from time import sleep
    #user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {}# 'User-Agent' : user_agent }
    data={}#"source":'',

    htmlSource=[]
    while not htmlSource:
        try:
            htmlSource = urllib2.urlopen(url,urllib.urlencode(data)).read()#,headers)
        except: # urllib2.HTTPError, e:
            print('Caught an error reading URL...HTTP?!')
            htmlSource=[]
            sleep(5)
            #if e.code == 401:
            #    dlog('HTTP ERROR!!: not authorized')
            #elif e.code == 404:
            #    dlog('HTTP ERROR!!: not found')
            #elif e.code == 503:
            #    dlog('HTTP ERROR!!: service unavailable')
            #else:
            #    dlog('HTTP ERROR!!: unknown error: ')
        if not htmlSource:
            print('Failed to open url'+url)
    
    htmlSource=re.sub("\n","",str(htmlSource)).replace('\r','')  # Remove newlines and ^M's
    # Remove all newlines and all tabs from html. Later I use tabs for CSV
    return(str(htmlSource).expandtabs())


########################################################################
def getDepartments():
########################################################################
    allDepts=wgetUBC("https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=0")
    deptsList=re.findall("""<tr class=section.>[ \n]*<td>[ \n]*<a href="([^"]*)">([^<]*)</a>[ \n]*</td>[ \n]*<td nowrap>([^<]*)</td>[ \n]*<td nowrap>([^<]*)</td>[ \n]*</tr>""",allDepts,re.DOTALL)
    depts=dict([[dd[1],{'www':dd[0],'faculty':dd[3],'name':dd[2].strip()}] for dd in deptsList])  #"
    assert 'RMES' in depts
    return(depts)

allKnownSectionTypes=[]
########################################################################
def parseInstructor(instr):
########################################################################
    print 'Looking up ',instr[1]
    url=instr[0]
    if url.startswith("/cs"):#https://courses.students.ubc.ca"):
        url="https://courses.students.ubc.ca"+url
    oneInstructor=wgetUBC(url)
    taughtSections=re.findall("""<tr class=section.><td>[^<]*</td>[ \n]*<td nowrap>[ \n]*<a href="([^"]*)"[^>]*>([^<]*)</a>[^<]*</td>[^<]*<td nowrap>([^<]*)</td>""",oneInstructor,re.DOTALL)
    insInfo={}
    email=re.findall("""<tr><td>Email:</td><td>([^>]*)</td></tr>""",oneInstructor,re.DOTALL)
    if email:
        assert len(email)==1
        insInfo={'email':email[0]}
    sectionTypes=uniqueInOrder([ll[2] for ll in taughtSections if ll[2]])
    types=dict([[''.join([cc for cc in st if cc.isalpha()]),len([ll for ll in taughtSections if ll[2]==st])] for st in sectionTypes      ])
    #nLecs=len([ll for ll in taughtSections if ll[2]=='Lecture'])
    types.update({'name':instr[1],'sectionsTaught2009':len(taughtSections)})

    for tt in types:
        if not tt in allKnownSectionTypes:
            cursor.execute("Show columns from instructors like '%s'"%tt)
            if not cursor.rowcount:
                cursor.execute('ALTER TABLE instructors ADD '+tt+' INT DEFAULT "0"')
                print ' ***  ADDED A NEW FIELD TO THE TABLE!!!!!!'+tt

    print " %s teaches %d courses! "%(instr[1],len(taughtSections)),types
    return(types,insInfo)#,'sectionsTaught2009':len(taughtSections),'lectureSectionsTaught2009':nLecs}) #" 'www':url,


########################################################################
def lookupStaff(lastName,firstName,email=None): # Get department and appointment!
########################################################################
    """
to do: use email address if can't find exact!
https://www.directory.ubc.ca/index.cfm?email=itgault@law.ubc.ca
"""

    def checkSearchResult(txt):
        allexacts=re.findall("""<h3>(Exact Matches[^<]*</h3>.*?)</table>""",txt,re.DOTALL)
        if allexacts and not 'Yellow Pages' in allexacts[0]:
            exacts=re.findall(r"""<a href='index.cfm.page=personDetail&row=\d*'>([^<]*)</a></p>\s*</td>\s*<td width='25%'>(.*?)</td>""",allexacts[0],re.DOTALL)
            if len(exacts)>1:
                print "Found more than one exact name match!! "
                return(False)
            return(exacts)
        return(False)
            
    exacts=checkSearchResult(wgetUBC("https://www.directory.ubc.ca/index.cfm?firstName=%s&lastName=%s"%(firstName,lastName)))
    if not exacts and email:
        print ' Trying to match by email......................'+email
        exacts=checkSearchResult(wgetUBC("https://www.directory.ubc.ca/index.cfm?email="+email))
    if not exacts: # Try without initials after first name
        print ' Trying to matchng by stripping initials ',firstName.split(' ')[0],lastName
        exacts=checkSearchResult(wgetUBC("https://www.directory.ubc.ca/index.cfm?firstName=%s&lastName=%s"%(firstName.split(' ')[0],lastName)))
    if not exacts: # Try just last name
        print ' Trying to matchng by last name .......'+lastName
        exacts=checkSearchResult(wgetUBC("https://www.directory.ubc.ca/index.cfm?lastName=%s"%(lastName)))
    if not exacts:
        return({})


    # Get department and appointment:
    appt,dept=re.findall(r"""\s*<p>([^<]*)<.*?<a href=[^>]*>([^<]*)</a>""",exacts[0][1],re.DOTALL)[0]
    return({'department':dept,'appointment':appt.strip(),'asstProf':int('Assistant Professor' in appt) })


########################################################################
def recheckAllStaff(result_set,deptsList=None): # repeat staff lookup for all unmatched names
    """
aghghghghgh i cannot do email trick for this, yet.  I cannot figure out how to fill out the staff/dept search form for courses.
"""
########################################################################
    for row in result_set:
        if row.get('department',''):
            continue
        sN,fN=row['name'].split(',')
        position=lookupStaff(sN.strip(),fN.strip())
        if not position:
            print 'Still failed to find ',row['name']
            continue
        kk=position.keys()
        sql="UPDATE instructors SET "+' , '.join([""" `%s`="%s" """%(kkk,str(position[kkk])) for kkk in kk if not kkk=='name'])+' WHERE instructors.name="'+row['name']+'"'
        print sql
        if cursor:
            try:
                cursor.execute(sql)#,record.values())
                id=cursor.lastrowid
                print('   Created new record %d'%id)
            except MySQLdb.Error, e:
                print "Error %d: %s" % (e.args[0], e.args[1])
                foooo
                sys.exit (1)
                

########################################################################
def analyseResults(result_set,deptsList=None):
########################################################################
    from dictTrees import dictTree
    # Clean up lookup of profs:
    for pp in result_set:
        appt1=pp['appointment'].replace('  ',' ')
        replacements=[
['Professor','Professor'],
['Prof,','Professor'],
#['Director & Professor','Professor'], # No... that's an obvious other responsibilities, so ignore the "& Professor"s
['Assistant Prof','Assistant Professor'],
['Asst Prof','Assistant Professor'],
["Ass't Prof",'Assistant Professor'],
['Assoc Prof','Associate Professor'],
['Associate Professor','Associate Professor'],
['Sessional','Sessional'],
['Adjunct Professor','Adjunct Professor'],
]

        for rr in replacements:
            if appt1.startswith(rr[0]):
                pp['appointment2']=rr[1]
                continue

        if appt1 and 'appointment2' not in pp:
            print 'What is ',appt1
        if 'appointment2' not in pp:
            pp['appointment2']='unknown'

        pp['rsectionsTaught2009']=pp['sectionsTaught2009']-pp.get('WaitingList',0)

    ins=dictTree(list(result_set),['department','appointment2'])

    displaySumNames=['rsectionsTaught2009','Lecture','Seminar']#Lecture-or-Seminar']

    deptsListCode=dict([[deptsList[dl]['name'],deptsList[dl]] for dl in deptsList])

    def meanetc(alist,kk):
        subset=[float(oneins.get(kk,0)) for oneins in list(alist) if kk in oneins]
        subset=[ff for ff in subset if not isnan(ff)]
        if not subset:
            return([NaN,0,NaN,NaN])
        assert not isnan(mean(subset))
        return([mean(subset),len(subset),min(subset),max(subset)])



    for appt in ['Assistant Professor', 'Associate Professor', 'Professor',]:
        fout=open('/home/cpbl/tmp/teaching-%s.tsv'%(appt.replace(' ','')),'wt')
        fout.write('\t'.join(['code','department','faculty',]+flattenList([[dsn,'N','min','max',] for dsn in displaySumNames]))+'\n')
        
        if '' in ins:
            ins['unidentified']=ins[''] # But everyone in an unidentified department will have "unknown" appointment type, since we couldn't look them up.
        for dept in ins.keys():
            if not dept:
                pass
            code=dgetget(deptsListCode, dept.split('(')[0].strip() ,'department','')

            #for oneprof in dgetget(ins,dept,appt,[]):
            #    onerecord=ins[dept][appt][oneprof]
            #    ins[dept][appt][oneprof]['Lecture-or-Seminar']=onerecord('Lecture',0)+dgetget(ins,dept,appt,'Seminar',0)
            meansBy=flattenList([meanetc(ins[dept].get(appt,[]),kk) for kk in displaySumNames])
            print meansBy
            fout.write('\t'.join([code,dept,dgetget(deptsList,dept,'faculty','')])+'\t'+'\t'.join([str(cc) for cc in meansBy])+'\n')
        fout.close()

########################################################################
########################################################################
########################################################################
# Main here... open database, then recursively cycle through all...
########################################################################
########################################################################
########################################################################


import MySQLdb
cursor=None
if 1:
    db = MySQLdb.connect("localhost", "", "", "")#,init_command='SET NAMES utf8')

    # Start by exporting the table to date:
    # Use the fancier, dict-based cursor for this (replaced below with plain version)
    cursor = db.cursor (MySQLdb.cursors.DictCursor)
    cursor.execute ("SELECT * FROM instructors")
    result_set = cursor.fetchall()
    kk=[]
    fout=open('/home/cpbl/tmp/instructorsMySQL.tsv','wt')
    for row in result_set:
        if not kk:
            kk=orderListByRule(row.keys(),['name','department','sectionsTaught2009','lectureSectionsTaught2009','asstProf','appointment','LabSeminar','Discussion','WaitingList','Lecture','FieldTrip','Rehearsal','ProblemSession','EssayReport','Studio','Project','DirectedStudies','ReservedSection','Practicum','LectureLaboratory','Tutorial','Thesis','DistanceEducation','Laboratory','Seminar'])
            print 'Using keys ',kk
            fout.write('\t'.join(kk)+'\n')
        fout.write(('\t'.join([str(row[kkk]) for kkk in kk])).replace('\tNone\t','\t\t')+'\n')
    fout.close()

    cursor.execute ("SELECT * FROM departmentsCompleted")
    depts_set = cursor.fetchall()
    deptsL=dict([[dd['department'],dd] for dd in list(depts_set)])
    if 0:
        recheckAllStaff(result_set,deptsList=deptsL)

    analyseResults(result_set,deptsList=deptsL)

    cursor = db.cursor()
    #cursor.execute('SET CHARACTER SET utf8')
    #cursor.execute('SET character_set_connection=utf8')

    #cursor.execute(""" SELECT "ALL" FROM instructors INTO OUTFILE '/tmp/instructorsMySQL.tsv' FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n' FOR UPDATE """)


depts=getDepartments()
allSectionTypes=[]



instructors={}
coursesWithoutInstructor=0
for dept in orderListByRule(sorted(depts),['RMES']):
    coursesWithoutInstructor=0
    cursor.execute('SELECT department FROM departmentsCompleted WHERE department="'+dept+'" and done2009=1')
    NEWDEPT=cursor.rowcount==0
    if not NEWDEPT:
        print 'No need to go through department '+dept+ ' for 2009-2010 academic year '
        if 0 and cursor:
            sql='UPDATE departmentsCompleted SET name="%s",faculty="%s" WHERE department="%s"'%(dgetget(depts,dept,'name',''),dgetget(depts,dept,'faculty',''),dept)
            cursor.execute(sql)#,record.values())

        continue
    # Start by finding all instructors who are teaching any lectures:
    print 'Looking up %s'%dept
    oneDept=wgetUBC("https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=1&dept="+dept)
    oneDeptCourses= re.findall("""<tr class=section.><td nowrap><a href="([^"]*)">([^<]*)</a></td><td nowrap>([^<]*)</td></tr>""",oneDept,re.DOTALL) #"

    for acourse in oneDeptCourses:
        print 'Looking up '+acourse[1]
        oneCourse=wgetUBC(acourse[0])
        taughtSections=re.findall("""<tr class=section.><td>[^<]*</td>\s*<td nowrap>\s*<a href="([^"]*)"[^>]*>([^<]*)</a>[^<]*</td>[^<]*<td nowrap>([^<]*)</td>""",oneCourse,re.DOTALL)

        for section in taughtSections:
             if section[2]=='Lecture' or not any([ss[2]=='Lecture' for ss in taughtSections]):
                  print 'Looking up section ',section[1].strip(),section[2]
                  oneLecture=wgetUBC(section[0])
                  # Caution! There can be multiple instructors for the course!
                  instructorText=re.findall("""<tr>[ \n]*<td nowrap>Instructor.*?: *</td>(.*?)</table>""",oneLecture,re.DOTALL)
                  if not instructorText:
                      coursesWithoutInstructor+=1
                      print "   No instructor for this section!"
                  else:
                      instructorLinks=re.findall("""<a href="([^"]*)">([^<]*)</a></td>""",instructorText[0],re.DOTALL)
                      assert len(instructorLinks)>=1
                      for instructor in instructorLinks:
                          who=instructor[1].strip()

                          # Check whether a record already exists for the instructor of this course.
                          if ',' in who:
                              if cursor:
                                  cursor.execute("SELECT name FROM instructors WHERE name=%s",(who))
                                  INSERTNEW=cursor.rowcount ==0
                              else:
                                  INSERTNEW=0
                              if who not in instructors or (not cursor and who not in instructors) or INSERTNEW: # Insert a new record or proceed with analysis if database is off
                                  # if who.strip() not in instructors:
                                  sN,fN=who.split(',')
                                  instructors[who],insInfo=parseInstructor(instructor)
                                  position=lookupStaff(sN.strip(),fN.strip(),email=insInfo.get('email',''))
                                  allSectionTypes=uniqueInOrder(allSectionTypes+instructors[who].keys())
                                  instructors[who].update(position)###{'appointment':position,'AsstProf':'Assistant Professor' in position,'dept':dept})
                                  #instructors[who].update({'AsstProf':int('Assistant Professor' in position.get('appt','')})
                                  print instructors[who]
                                  kk=instructors[who].keys()
                                  if not INSERTNEW:
                                      sql="UPDATE instructors SET "+' , '.join([""" `%s`="%s" """%(kkk,str(instructors[who][kkk])) for kkk in kk if not kkk=='name'])+' WHERE instructors.name="'+str(instructors[who]['name'])+'"'
                                  else:
                                      sql="INSERT INTO instructors ("+','.join(kk) +") VALUES ("+','.join(['"'+str(instructors[who][kkk])+'"' for kkk in kk])+")"
                                  print sql
                                  if cursor:
                                      try:
                                          cursor.execute(sql)#,record.values())
                                          id=cursor.lastrowid
                                          print('   Created new record %d'%id)
                                      except MySQLdb.Error, e:
                                          print "Error %d: %s" % (e.args[0], e.args[1])
                                          foooo
                                          sys.exit (1)


                              else:
                                  print 'Already know about ',who
                          else:
                              print 'Weird instructor: not a person? ',who
    print '   ALL KNOWN SECTION TYPES SO FAR:',  allSectionTypes 
    sql='INSERT INTO departmentsCompleted (department,done2009,coursesWithoutInstructor,name,faculty) VALUES ("%s",1,"%d","%s","%s")'%(dept,coursesWithoutInstructor,dgetget(depts,dept,'name',''),dgetget(depts,dept,'faculty',''))

    if not NEWDEPT:
        sql='UPDATE departmentsCompleted SET department="%s",done2009="1",coursesWithoutInstructor="%d",name="%s",faculty="%s" WHERE department="%s"'%(dept,coursesWithoutInstructor,dgetget(depts,dept,'name',''),dgetget(depts,dept,'faculty',''),dept)

    print sql
    if cursor:
        try:
            cursor.execute(sql)#,record.values())
            id=cursor.lastrowid
            print ' FINISHED DEPT '+dept
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
            foooo
            sys.exit (1)


cursor.close ()
db.commit ()
db.close ()


