# Read the directory structure and email files into a MySQL database

# http://www.infobarrel.com/Facts_on_the_Enron_Meltdown
# 20,581 emails in these files.

import os, MySQLdb, random, string
#import re

# Connection parameters passed straight to MySQLdb.connect(); replace the
# placeholder credentials with real ones before running.
dbsettings = dict(host="localhost", user="yourname", passwd="yourpassword", db="kiwipycon")

# Root directory of the email dump.  Written as a raw string so backslashes in
# the Windows path are never read as escape sequences (a real path containing
# "\t", "\n", "\U..." etc. would otherwise be silently corrupted).
basepath = r'C:\yourpath\enron_flat'

# One connection and one cursor, opened once at start-up.
conn = MySQLdb.connect(**dbsettings)
cursor = conn.cursor()

lvl1listing=os.listdir(basepath)
for lvl1 in lvl1listing:
	lvl1path=basepath+"\\"+lvl1
	lvl2listing=os.listdir(lvl1path)
	for lvl2 in lvl2listing:
		lvl2path=lvl1path+"\\"+lvl2
		lvl3listing=os.listdir(lvl2path)
		for filename in lvl3listing:
			file=open(lvl2path+"\\"+filename,'r')
			emailtext = file.read()
			emaillines = emailtext.splitlines()
			file.close()
			
			emailtext=emailtext.replace('X-bcc','CCSPLIT')
			emailtext=emailtext.replace('Mime-Version','CCSPLIT')
			emailtext=emailtext.replace('MIME-Version','CCSPLIT')
			
			messageID=emaillines[0][11:].strip()
			messageDate=emaillines[1][6:].strip()
			messageFrom=emaillines[2][6:].strip()
			messageToList=emailtext.partition('\nTo: ')[2].partition('Subject: ')[0].split()
			messageTo=''.join(messageToList)
			messageSubject=emailtext.partition('Subject: ')[2].partition('\n')[0].strip()
			
			if emailtext.find('\nCc: ') and emailtext.find('CCSPLIT: ') and (emailtext.find('\nCc: ')<emailtext.find('CCSPLIT: ')):
				messageCCList=emailtext.partition('\nCc: ')[2].partition('CCSPLIT: ')[0].split()
			else:
				messageCCList=[]
			
			messageCC=''.join(messageCCList)
			messageBCC=''.join(emailtext.partition('Bcc: ')[2].partition('X-From: ')[0].split())
			messageBody=emailtext.partition('X-FileName: ')[2].partition('\n')[2].strip()
			messageBody=messageBody.replace('\n','\r\n')
			
			FromInternal = True if messageFrom.partition('@')[2].lower()=='enron.com' else False
			ToHowMany = len(messageToList)
			ToHowManyInternal = ToHowManyExternal =0
			for recipient in messageToList:
				if recipient.partition('@')[2].strip(" ,").lower()=='enron.com':
					ToHowManyInternal+=1
				else:
					ToHowManyExternal+=1


			CCHowMany = len(messageCCList)
			CCHowManyInternal = CCHowManyExternal = 0
			for recipient in messageCCList:
				if recipient.partition('@')[2].strip(" ,").lower()=='enron.com':
					CCHowManyInternal+=1
				else:
					CCHowManyExternal+=1

			# Am using the term 'word' here pretty loosely, since split will break some human words into two or more items.
			# Is good enough for my current purposes, however.
			#body_wordlist=re.sub('[^\w&^\d]', ' ', messageBody)
			body_wordlist=messageBody.split()
			num_words_in_body=len(body_wordlist)
			perc_words_are_digits = perc_words_are_caps = float(0)
			
			fullchars=string.maketrans('','')
			dropchars=string.punctuation+string.whitespace
			for word in body_wordlist:
				cleanword=word.translate(fullchars,dropchars)
				if cleanword.isdigit(): perc_words_are_digits+=1 
				if cleanword.isupper(): perc_words_are_caps+=1
			
			if num_words_in_body >0:
				perc_words_are_digits=int(perc_words_are_digits/num_words_in_body*100)
				perc_words_are_caps=int(perc_words_are_caps/num_words_in_body*100)
			else:
				perc_words_are_digits = perc_words_are_caps = int(0)
				
			IsForwarded = True if messageSubject.lower().find('fw:')==0 else False
			IsReply = True if messageSubject.lower().find('re:')==0 else False
			
			random.seed()
			randnum = random.random()
			
			# My, isn't the following ugly?
			try:
				cursor.execute(\
					"insert into basedata (person, folder, email_dump, msg_id, msg_date, msg_from,"
					" msg_to, msg_subject, msg_cc, msg_bcc, msg_body, from_internal,"
					" to_howmany, to_howmany_internal, to_howmany_external,"
					" cc_howmany, cc_howmany_internal, cc_howmany_external,"
					" perc_words_are_digits, perc_words_are_caps, num_words_in_body,"
					" is_forwarded, is_reply, randnum ) "
					"values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
					(lvl1,
					 lvl2,
					 emailtext,
					 messageID,
					 messageDate,
					 messageFrom[:44],
					 messageTo,
					 messageSubject,
					 messageCC,
					 messageBCC,
					 messageBody,
					 FromInternal,
					 ToHowMany,
					 ToHowManyInternal,
					 ToHowManyExternal,
					CCHowMany,
					CCHowManyInternal,
					CCHowManyExternal,
					perc_words_are_digits,
					perc_words_are_caps,
					num_words_in_body,
					 IsForwarded,
					 IsReply,
					 randnum,
					 )
				)
				conn.commit()
			except:
				print "error: %s-%s-%s" % (lvl1,lvl2,filename)
	print lvl1
	

# Release the database resources: the cursor first, then the connection.
for handle in (cursor, conn):
    handle.close()
