#!/usr/bin/env python
#Copyright (c) 2008 Jeff Bryner
#python script to gather gmail artifacts from a pd process memory dump
#example:
#
#on windows box, use pd from www.trapkit.de ala:
#pd -p 1234> 1234.dump
#
#where 1234 is a running instance of IE
#
#on linux box do:
#strings -el 1234.dump> memorystrings.txt
#pdgmail -f memorystrings.txt
#
#It'll find what it can out of the memory image including contacts, emails, last access times, IP addresses etc.

#This program is free software; you can redistribute it and/or modify it under
#the terms of the GNU General Public License as published by the Free Software
#Foundation; either version 2 of the License, or (at your option) any later
#version.

#This program is distributed in the hope that it will be useful, but WITHOUT
#ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
#FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

#You should have received a copy of the GNU General Public License along with
#this program; if not, write to the Free Software Foundation, Inc.,
#59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
import sys
import os
import types
import struct
from time import ctime
import getopt
import array
import re

# Bytes/characters in the 0x80-0xFF range; safestring() rewrites them.
safestringre = re.compile(r'[\x80-\xFF]')
# Dotted-quad lookalike (octet values are not range checked).
ipre = re.compile(r'(?:\d{1,3}\.){3}\d{1,3}')
# Gmail "datapack" records: ["ct",...] contact, ["me",...] account owner,
# ["ms",...] / ["mb",...] message header, ["la",...] last access.
# The '.*' is greedy, so each match runs to the LAST ']' on the line.
gmailcontactre = re.compile(r'(?:\["ct.*\])')
gmailmere = re.compile(r'(?:\["me.*\])')
# NOTE: re.MULTILINE only changes '^'/'$'; '.' still stops at a newline, so
# these two patterns match within a single line of the blob.
gmailmessagere = re.compile(r'(?:\["ms".*\])', re.MULTILINE)
gmailmessagere2 = re.compile(r'(?:\["mb".*\])', re.MULTILINE)
gmaillastaccessre = re.compile(r'(?:\["la.*\])')
fromre = re.compile(r'(?:[Ff]rom\s.+/)')


def safestring(badstring):
    """Return badstring with every 0x80-0xFF character escaped.

    Each such character is replaced by the text 'char#<ordinal>;' so the
    result only contains characters below 0x80 (for byte-string input).
    """
    return safestringre.sub(lambda c: 'char#%d;' % ord(c.group(0)), badstring)


def parseOptions():
    """Parse sys.argv and return the option dictionary.

    Returns a dict with the keys 'file' (path, '' means read stdin),
    'verbose' (bool) and 'bodies' (bool, True unless -b/--bodies is given).

    Exits the process after printing help (-h) or the version (-V), and
    exits with status 2, after printing the error and the usage text to
    stderr, when getopt rejects the command line.
    """
    options = {'file': '',
               'verbose': False,
               'bodies': True}
    helpstr = 'Usage: ' + sys.argv[0] + ' [OPTIONS]' + """\n
Options:
   -f, --file       the file to use (stdin if no file given)
   -b, --bodies     don't look for message bodies (helpful if you're getting too many false positives on the mb regex)
   -h, --help       prints this
   -v,--verbose    be verbose (prints filename, other junk)
   -V,--version    prints just the version info and exits.

This expects to be unleashed on the result of running strings -el on a pd dump from windows process memory. Anything other than that, your mileage will certainly vary.\n
\n
"""
    try:
        optlist, args = getopt.getopt(
            sys.argv[1:], 'vhbf:V',
            ['help', 'file=', 'version', 'verbose', 'bodies'])
    except getopt.GetoptError as err:
        # Unknown option or missing argument: report it instead of a traceback.
        sys.stderr.write('%s\n%s' % (err, helpstr))
        sys.exit(2)

    for o, a in optlist:
        if o in ('-h', '--help'):
            print(helpstr)
            sys.exit()
        elif o in ('-v', '--verbose'):
            options['verbose'] = True
        elif o in ('-b', '--bodies'):
            options['bodies'] = False
        elif o in ('-V', '--version'):
            print("pdgmail version 0.2.0 Jeff Bryner")
            sys.exit()
        elif o in ('-f', '--file'):
            # The only option that carries a value; previously handled by
            # exec'ing generated source code.
            options['file'] = a
    return options


def _fields(record):
    """Strip every square bracket from a datapack record and split on commas."""
    return record.replace('[', '').replace(']', '').split(',')


def _decode_escapes(text):
    """Expand backslash escapes (e.g. \\u003c) in text and return unicode text."""
    if isinstance(text, bytes):
        # Python 2 str: same as unicode(text, 'unicode-escape').
        return text.decode('unicode_escape')
    # Text string: go through bytes, keeping characters above 0xFF as escapes
    # so they survive the round trip.
    return text.encode('latin-1', 'backslashreplace').decode('unicode_escape')


def _print_decoded(label, fallback_label, items):
    """Print each item escape-decoded, or raw under fallback_label on failure."""
    for item in items:
        try:
            print(label + _decode_escapes(item))
        except UnicodeError:
            # Malformed escape or unprintable result: dump this item as-is.
            print(fallback_label + str(item))


def _report_line(line, verbose):
    """Print the contact, "me" and last-access artifacts found on one line.

    Returns the last "me" record on the line that could be parsed, or None
    when the line holds none.
    """
    me_record = None

    # Contact record, e.g. ["ct","contactname","emailaddress@gmail.com",0,"3"]
    for ct in gmailcontactre.findall(line):
        try:
            fields = _fields(ct)
            print("contact: name: %s email: %s" % (fields[1], fields[2]))
        except IndexError:
            print("raw contact: " + line.strip())

    # "me" record, e.g. ["me","someemail@gmail.com"]
    for me in gmailmere.findall(line):
        try:
            print("gmail account: email: %s" % (_fields(me)[1]))
        except IndexError:
            print("raw gmail account: " + line.strip())
            continue
        me_record = str(me)
        if verbose:
            print("gmail me record:" + str(me))

    # Last-access records are only trusted on lines that also contain
    # something IP-shaped; other "la" hits are skipped.
    if ipre.search(line):
        for la in gmaillastaccessre.findall(line):
            try:
                fields = _fields(la)
                print("last access: %s from IP %s, most recent access %s from IP %s"
                      % (fields[1], fields[3], ctime(float(fields[8])), fields[9]))
            except (IndexError, ValueError, OverflowError, OSError):
                print("last access (can't parse it): " + str(la))

    return me_record


def _scan_lines(stream, verbose, collect):
    """Run _report_line over every line of stream.

    Each line is passed through safestring() first. Returns a tuple
    (me_record, text): the last "me" record seen ('' if none) and, when
    collect is true, the escaped lines joined back together ('' otherwise).
    """
    me_record = ""
    kept = []
    for raw in iter(stream.readline, ''):
        line = safestring(raw)
        if collect:
            kept.append(line)
        found = _report_line(line, verbose)
        if found is not None:
            me_record = found
    return me_record, ''.join(kept)


def _message_body_re(gmailacctme):
    """Compile the regex that spans from an "ms" datapack to the "me" record.

    The match starts at ["ms","<10-20 chars>","" and ends, after 200 to
    10000 further characters, at '[' followed by the given "me" record.
    """
    # NOTE(review): the literal '\[' in front of the "me" record (which itself
    # starts with '[') is kept from the original pattern - confirm that the
    # dumps really contain '[["me"' before changing it.
    # re.escape() keeps regex metacharacters inside the record literal.
    pattern = (r'((?:\["ms\",".{10,20}",""){1}.{200,10000}?(?:\['
               + re.escape(str(gmailacctme)) + r'))')
    return re.compile(pattern, re.IGNORECASE | re.DOTALL)


def gatherArtifacts():
    """Scan the input named by the global options dict and print artifacts.

    Reads options['file'] (or stdin when it is ''), prints contacts, the
    account ("me") record and last-access records line by line, then prints
    message headers and, if options['bodies'] is true, message bodies found
    in the whole input.

    Exits with status 1 after writing 'Cannot open file' to stderr when the
    file cannot be opened. Any other error is reported on stderr.
    """
    path = options["file"]
    verbose = options["verbose"]
    if verbose:
        print("FileName: %s " % path)

    if path != '':
        try:
            source = open(path, 'r')
        except IOError:
            sys.stderr.write('Cannot open file\n')
            sys.exit(1)
    else:
        source = sys.stdin
    # NOTE: under Python 3 both sources are decoded with the default text
    # encoding; under Python 2 they are byte strings.

    try:
        try:
            if source is sys.stdin:
                # stdin cannot be re-read, so rebuild the blob from the
                # (escaped) lines as they are consumed.
                gmailacctme, filedata = _scan_lines(source, verbose, True)
            else:
                filedata = source.read()
                source.seek(0)
                gmailacctme = _scan_lines(source, verbose, False)[0]
        finally:
            if source is not sys.stdin:
                source.close()

        # Message headers: datapack type "ms", then the alternate type "mb".
        _print_decoded("message header: ", "message headers: ",
                       gmailmessagere.findall(filedata))
        _print_decoded("message header: ", "message headers: ",
                       gmailmessagere2.findall(filedata))

        if options["bodies"]:
            # Best guess at message bodies: they begin with an "ms" datapack
            # and, in memory, usually lack the closing brackets, e.g.
            #   ["ms","113b0d734737dec4","", .... ["me","someemail@gmail.com"]
            # so look for the "me" record that follows within ~10k characters.
            gmailmessagebodyre = _message_body_re(gmailacctme)
            if verbose:
                sys.stderr.write('regex for messagebody is: '
                                 + gmailmessagebodyre.pattern + '\n')
            _print_decoded("message body: ", "message bodies:",
                           gmailmessagebodyre.findall(filedata))
    except Exception as err:
        # Top-level boundary: report and return; does not trap
        # KeyboardInterrupt or SystemExit.
        sys.stderr.write("Error handling input: %s\n" % err)


def main():
    """Parse the command line into the global options dict and run the scan."""
    global options
    options = parseOptions()
    gatherArtifacts()


if __name__ == '__main__':
    main()