# `de_obf_helper.py`
# Helper for deobfuscation of Javascript code.
# Author: follower@myrealbox.com
# Copyright: 2005
# License: GPL 2.0
# Version: 0.1.0
# MOTD: Are you all *insane*???
# Example usage: (Using Google Maps code)
# Note: All code must be pretty-printed in the files.
# * Copy file `maps.1.js`
# * Execute ('-d' = generate documentation):
# ./de_obf_helper.py -d maps.1.js > maps.1.html
# * Generates or updates these files:
# maps.1.html funcs-maps.1.js.txt maps.1.js-linenums.html
# * Edit `funcs-maps.1.js.txt`, the format is (one entry per line, fields separated by tabs):
# class/function name<TAB>args<TAB>deobfuscated name<TAB>description
# e.g.:
# Y<TAB>Vc<TAB>XSLT<TAB>Handles XSLT processing
# * Re-run above '-d' command to refresh documentation.
# * Then, when target code has been modified and obfuscated differently:
# * Copy file `maps.2.js`
# * Execute ('-u' = upgrade functions/class metadata to new names):
# ./de_obf_helper.py -u maps.1.js maps.2.js
# * Generates or updates:
# funcs-maps.2.js.txt
# * Execute:
# ./de_obf_helper.py -d maps.2.js > maps.2.html
# and documentation matching the most recent obfuscation is generated.
# Note: The "upgrade" step is not perfect, it uses various heuristics to
# attempt to identify the same function/class in each file.
# Note: This code is very rough, with hard coded stuff all over the place,
# but it does the job for me so I figured I'd throw it up on the net.
# Perhaps a little too literally... :-)
import os
import re
import sys
import sets
import shutil
import difflib
import textwrap
# Matches a named Javascript function definition anywhere in the text.
# Groups: (function name, raw argument list).
# Raw strings are used so that `\s`, `\(` etc. reach the regex engine
# without relying on Python passing unknown string escapes through.
RE_FUNCTION_DEF = re.compile(r"function\s+?(.+?)\((.*?)\)")
#RE_FUNCTION_DEF = re.compile("^function\s+?(.+?)\((.*?)\)", re.DOTALL | re.MULTILINE)
# Matches `Parent.member=function(args)` at the start of a line.
# Groups: (parent name, dotted member name, raw argument list).
RE_METHOD_DEF = re.compile(r"^(\w+?)\.([\w.]+?)=function\((.*?)\)", re.MULTILINE)
# TODO: Make this all lazy, but cached.
class SourcePropertyMixin(object): # TODO: Abstract out more functionality?
    """Mixin providing a read-only `source` property.

    The host class must set `_source` (a sliceable text), `startIdx` and
    `endIdx`; `source` is the slice `_source[startIdx:endIdx]`.
    """

    def _getSource(self):
        """Return the part of `_source` between `startIdx` and `endIdx`."""
        return self._source[self.startIdx:self.endIdx]

    source = property(_getSource, doc = "")
class Function(SourcePropertyMixin, object): # TODO: Rename? & Class-ify?
    """A function (or constructor) found in the source, plus its metadata.

    Attributes:
    * `name` - name as it appears in the source.
    * `args` - raw argument list text.
    * `newName` - deobfuscated name ("" when not set).
    * `description` - description text ("" when not set).
    * `text` - text of the matched definition header.
    * `lineNumber` - line of the definition in the source.
    * `startIdx`, `endIdx` - offsets into `_source` used by `source`.
    * `methods` - list of Method instances attached to this function.
    """

    def __init__(self, functionName, functionArgs,
                 functionNewName = "", functionDescription = "",
                 text=None, lineNumber = None,
                 startIdx = None, endIdx = None,
                 source = None):
        self.name = functionName
        self.args = functionArgs
        self.newName = functionNewName
        self.description = functionDescription
        self.text = text
        self.lineNumber = lineNumber
        # Offset of function source within the source file.
        self.startIdx = startIdx
        self.endIdx = endIdx
        self._source = source
        self.methods = []
        self.otherName = "" # TODO: Handle this better. Key by file name?

    def update(self, source):
        """Copy the non-empty metadata fields of `source` onto this instance.

        * `source` another Function instance.

        Only `args`, `newName` and `description` are considered; a field
        that is empty on `source` leaves the current value untouched.
        """
        #if source.name == self.name:
        for attr in ["args", "newName", "description"]: #TODO:Improve this?
            newValue = getattr(source, attr)
            if newValue:
                setattr(self, attr, newValue)
        # raise Exception("Functions are not of the same name")

    def asExportString(self):
        """Return the tab-delimited metadata line for this function.

        Field order is name, args, newName, description, which is the
        order `functionFromString` unpacks.
        """
        return "\t".join([self.name, self.args, self.newName,
                          self.description])

    def _getSignature(self):
        """Return the names of this function's methods, in order."""
        return [m.name for m in self.methods]

    signature = property(_getSignature, doc="")
# TODO: Make this static?
def functionFromString(metaData):
    """Build a Function from one line of a metadata file.

    * metaData - Tab delimited funcName, funcArgs, funcNewName, funcDesc

    Missing trailing fields default to "". Only the first three tabs act
    as separators, so a description may itself contain tabs.
    """
    fields = metaData.split("\t", 3)
    # Pad short lines so hand-edited entries without a description (or new
    # name) still load instead of raising ValueError on unpacking.
    fields.extend([""] * (4 - len(fields)))
    funcName, funcArgs, funcNewName, funcDesc = fields
    return Function(funcName, funcArgs, funcNewName, funcDesc)
class Method(SourcePropertyMixin, object):
    """A `Parent.member=function(...)` definition found in the source.

    Attributes:
    * `name` - member name with any "prototype." removed.
    * `args` - raw argument list text.
    * `text` - text of the matched definition header.
    * `lineNumber` - line of the definition in the source.
    * `startIdx`, `endIdx` - offsets into `_source` used by `source`.
    """

    def __init__(self, methodName, methodArgs,
                 text=None, lineNumber = None,
                 startIdx = None, endIdx = None,
                 source = None):
        # TODO: Record if 'prototype'?
        self.name = methodName.replace("prototype.", "")
        self.args = methodArgs
        self.text = text
        self.lineNumber = lineNumber
        # Offset of function source within the source file.
        self.startIdx = startIdx
        self.endIdx = endIdx
        self._source = source
def exportFuncs(filename, funcs):
    """Merge an existing metadata file into `funcs`, then rewrite the file.

    * `filename` - path of the tab-delimited metadata file.
    * `funcs` - dict mapping function name to Function; modified in place.

    If `filename` exists it is first copied to `<filename>.bak`. Each line
    is parsed with `functionFromString`; an entry whose name is already in
    `funcs` updates that Function, any other entry is added. Finally every
    Function in `funcs` is written back, one per line.
    """
    if os.path.exists(filename):
        shutil.copy(filename, "%s.bak" % filename)
        fh = open(filename)
        try:
            for line in fh:
                # Strip only the line terminator: tabs must survive (empty
                # trailing fields), and a final line without a newline must
                # not lose its last character.
                line = line.rstrip("\r\n")
                if not line:
                    continue
                funcMetaData = functionFromString(line)
                # TODO: Improve this...
                try:
                    funcs[funcMetaData.name].update(funcMetaData)
                except KeyError: # TODO: Only catch missing funcName?
                    funcs[funcMetaData.name] = funcMetaData
        finally:
            fh.close()

    fh = open(filename, "w")
    try:
        for funcMetaData in funcs.values():
            fh.write("%s\n" % funcMetaData.asExportString())
    finally:
        fh.close()
#BASE_RE_FUNCTION_REFERENCES = "([\w.])+?\s*?=\s*?%s\((.+?)\)"
#BASE_RE_FUNCTION_REFERENCES = "^[^\n]*?([\w.])+?\s*?=\s*(.*?)\s*%s\((.*?)\).*?$"
# TODO: Check this matches all references correctly.
#BASE_RE_FUNCTION_REFERENCES = "^.*?\W%s(\((.*?)\).*?$|($|\W.*?$))"
# Template for a whole line that contains a call `<name>(...)` preceded by
# a non-word character; `%s` receives the (escaped) function name.
BASE_RE_FUNCTION_REFERENCES = r"^.*?\W%s\((.*?)\).*?$"
#BASE_RE_FUNCTION_REFERENCES = "^.*?\W%s(\((.*?)\).*?$|[\s;]*?$)"
def getReferences(source, funcName):
    """Return the source lines that appear to call `funcName`.

    * `source` - full text to search.
    * `funcName` - function name as it appears in `source`.

    Each result is the complete matched line. Matches whose text starts
    with "function" are skipped so that the definition itself is not
    reported as a reference.
    """
    # Escape the name: Javascript identifiers may contain `$`, which would
    # otherwise be read as a regex anchor and never match.
    # MULTILINE makes `^`/`$` in the template apply per line.
    re_functionReference = re.compile(
        BASE_RE_FUNCTION_REFERENCES % re.escape(funcName), re.MULTILINE)
    return [match.group(0)
            for match in re_functionReference.finditer(source)
            if not match.group(0).startswith("function")]
def heading(level, klass, content, id_ = None):
    """Return an HTML heading element as a string.

    * `level` - heading level (the N in hN).
    * `klass` - value of the class attribute.
    * `content` - inner markup, inserted as-is (not escaped).
    * `id_` - optional value for an id attribute placed before the class.

    NOTE(review): the tag template was missing from the copy this was
    restored from; it is reconstructed from the (level, klass, content,
    level) argument tuple - confirm the exact markup.
    """
    idAttr = ""
    if id_:
        # Built explicitly rather than patched into the template with
        # str.replace, so an id containing "%" cannot corrupt the format.
        idAttr = ' id="%s"' % id_
    return '<h%d%s class="%s">%s</h%d>' % (level, idAttr, klass,
                                           content, level)
def paragraph(klass, content):
    """Return an HTML paragraph with class `klass` wrapping `content`.

    `content` is inserted as-is (not escaped).

    NOTE(review): the tag markup was missing from the copy this was
    restored from (one placeholder for two values); reconstructed from
    the function name and arguments - confirm the exact markup.
    """
    return '<p class="%s">%s</p>' % (klass, content)
def href(url, text):
    """Return an HTML link to `url` with link text `text`.

    Both values are inserted as-is (not escaped).

    NOTE(review): the tag markup was missing from the copy this was
    restored from (one placeholder for two values); reconstructed from
    the function name and arguments - confirm the exact markup.
    """
    return '<a href="%s">%s</a>' % (url, text)
def table(datalines, columns):
# Split `datalines` into consecutive chunks of len(datalines) / columns
# items, wrap each chunk with a format string (the chunk's items joined by
# " \n"), and return the wrapped chunks joined by newlines.
# NOTE(review): the format string below has one placeholder but receives
# two values (100 / columns and the joined chunk); markup around the
# placeholders appears to be missing from this copy - confirm.
parts = []
# NOTE(review): when len(datalines) < columns this division yields 0 under
# Python 2 integer division, so `count` never advances and the loop below
# does not terminate.
numPerCol = len(datalines) / columns
count = 0
# `<=` means that when count equals len(datalines) the loop still runs once
# more with an empty slice.
while count <= len(datalines):
parts.append('%s ' % (100 / columns,
" \n".join(datalines[count:count+numPerCol])))
count += numPerCol
return "\n".join(parts)
# TODO: Find the proper one...
def escape(text):
    """Return `text` with HTML-significant characters replaced by entities.

    "&" is replaced first so the ampersands introduced by the other
    replacements are not escaped a second time.
    """
    return (text.replace("&", "&amp;")
                .replace("<", "&lt;")
                .replace(">", "&gt;"))
def referenceList(func, references):
# Build the "References:" section for `func`: a heading followed by one
# entry per unique (whitespace-stripped) reference line, joined by newlines.
# NOTE(review): MAX_REFS_TO_DISPLAY is not defined anywhere in the visible
# file - confirm where it comes from.
# NOTE(review): the string literals appended below look like markup with
# the tags removed - confirm against the original file.
parts = []
parts.append(heading(3, "references", "References:"))
# De-duplicate via a set, so the display order of references is not the
# order they appear in the source.
uniqueReferences = list(sets.Set([r.strip() for r in references]))
for reference in uniqueReferences[:MAX_REFS_TO_DISPLAY]:
# When a deobfuscated name exists, show it (with any "~" removed) in place
# of the original name inside the reference text.
if func.newName:
# TODO: Check this substitution handles all cases.
# NOTE(review): this re.sub call has no third argument or closing
# parenthesis here; a continuation line appears to be missing.
reference = re.sub("(\W|^)%s(\W|$)" % func.name,
r"\1%s\2" % func.newName.replace("~",""),
parts.append('%s '
% escape(reference))
# More references exist than were listed above.
if len(uniqueReferences) > MAX_REFS_TO_DISPLAY:
parts.append('... ')
parts.append(" ")
return "\n".join(parts)
def formatFunction(f, sourceURL):
# Build the documentation section for function `f`.
# Returns a 2-tuple: (link to "#<name>" for the table of contents,
# section text joined by newlines).
# NOTE(review): several string literals here look like markup with the
# tags removed, and at least one statement head is missing - confirm
# against the original file.
parts = []
# Display name: the deobfuscated name with any "~" removed, falling back
# to the original name when that is empty.
name = f.newName.replace("~","") or f.name
# Section heading; `name` is also passed as the heading's id_, which is
# what the "#<name>" link returned below refers to.
parts.append(heading(2, "fname",
'%s [%s ]' %
(name, sourceURL, f.lineNumber, f.name), name))
parts.append(heading(3, "section", "Args:"))
# NOTE(review): the line below closes calls that are not opened on any
# visible line; the start of this statement appears to be missing.
f.args.replace(",",", "))) #TODO: Change formatting.
parts.append(paragraph("desc", f.description))
if f.methods:
parts.append(heading(3, "section", "Methods:"))
for m in f.methods:
parts.append(paragraph("method", "%s(%s)" % (m.name, m.args)))
return (href("#%s" % name, name) , "\n".join(parts))
def extractFunctionInfo(source):
    """Find the functions and methods defined in `source`.

    * `source` - full text of the (pretty-printed) Javascript file.

    Returns a dict mapping function name to Function. Definitions matched
    by RE_METHOD_DEF are attached to the Function named by their parent;
    methods whose parent is not a known function are ignored.
    """
    funcs = {}
    for match in RE_FUNCTION_DEF.finditer(source):
        text = match.group(0)
        funcName, args = match.groups()
        # Use the position of this match: searching for `text` again would
        # return the first textual occurrence, which is the wrong place
        # when the same header text appears more than once.
        startIdx = match.start()
        lineNumber = source.count("\n", 0, startIdx) + 1
        dummy, endIdx = getBlockOffsets(source, startIdx)
        funcs[funcName] = Function(funcName, args,
                                   text=text, lineNumber=lineNumber,
                                   startIdx=startIdx, endIdx=endIdx,
                                   source=source)

    #print "Functions found: %d" % (len(funcs))
    #print funcs.keys()

    for match in RE_METHOD_DEF.finditer(source):
        text = match.group(0)
        parent, methodName, args = match.groups()
        try:
            func = funcs[parent]
        except KeyError:
            # It's adding to builtin object...
            #print parent #TODO: Output this...
            continue
        startIdx = match.start()
        # NOTE(review): no "+ 1" here, unlike the function line numbers
        # above - confirm which numbering is intended.
        lineNumber = source.count("\n", 0, startIdx)
        dummy, endIdx = getBlockOffsets(source, startIdx)
        func.methods.append(Method(methodName, args,
                                   text=text, lineNumber=lineNumber,
                                   startIdx=startIdx, endIdx=endIdx,
                                   source=source))
    return funcs
def getBlockOffsets(source, startIdx):
    """Locate the brace-delimited block that starts at or after `startIdx`.

    * `source` - text to scan.
    * `startIdx` - offset from which to look for the opening "{".

    Returns `(blockStart, blockEnd)`: the offset of the opening "{" and the
    offset just past its matching "}", so `source[blockStart:blockEnd]` is
    the whole block including both braces.

    Raises ValueError when there is no "{" at or after `startIdx`, or when
    the opening brace is never closed (instead of scanning forever).

    Braces are counted purely textually; ones inside Javascript string or
    regex literals are not treated specially.
    """
    blockStart = source.index("{", startIdx)
    depth = 0
    for brace in re.compile(r"[{}]").finditer(source, blockStart):
        if brace.group(0) == "{":
            depth += 1
        else:
            depth -= 1
            if depth == 0:
                return (blockStart, brace.end())
    raise ValueError("no matching '}' for '{' at offset %d" % blockStart)
def formatLine(lineNumber, text):
# Format one source line for the numbered listing; `text` is passed
# through escape() before insertion.
# NOTE(review): the format string has one placeholder but receives two
# values (lineNumber and the escaped text); markup around the placeholders
# appears to be missing from this copy - confirm against the original.
return ' %s' % (lineNumber, escape(text))
def generateNumberedSource(sourceFilename, outputFilename):
    """Write a line-numbered rendering of `sourceFilename`.

    * `sourceFilename` - file to read.
    * `outputFilename` - file to write; when it already exists it is first
      copied to `<outputFilename>.bak`.

    Each input line is passed through `formatLine` together with its
    0-based index, and the results are concatenated into the output file.
    """
    parts = []
    fh = open(sourceFilename)
    try:
        for lineNumber, text in enumerate(fh):
            parts.append(formatLine(lineNumber, text))
    finally:
        fh.close()

    if os.path.exists(outputFilename):
        shutil.copy(outputFilename, "%s.bak" % outputFilename)

    fh = open(outputFilename, "w")
    try:
        fh.write("".join(parts))
    finally:
        fh.close()
def generateDocumentation(source, funcs, sourceURL):
# Print the reference page for `funcs` to standard output.
# * `source` - full text of the Javascript file, searched for references.
# * `funcs` - dict of Function instances, keyed by name.
# * `sourceURL` - name of the line-numbered source page to link to.
# NOTE(review): the page template printed below looks like markup with the
# tags removed, and its quoting no longer balances - restore it from the
# original file before relying on this function.
print """
Google Maps Classes and Functions Reference
print """
print ""
print """
Google Maps Classes and Functions Reference
Rough initial version. Based on original annotation from http://spaces.msn.com/members/sompost/ with additions.
Original name (as listed in %s ) is given in square brackets and linked to the original source. Meta data is from Google Maps meta data file . Generated by Javascript deobfuscation helper .
""" % (sourceURL.replace("-linenums.html",""),
"funcs-%s.txt" % sourceURL.replace("-linenums.html",""))
sections = []
toc = []
# Pair each function with a lower-cased key: its deobfuscated name, or the
# original name when no deobfuscated name is set.
# NOTE(review): nothing visible sorts this list despite its name -
# presumably a sort call is missing; confirm.
sortedFuncs = [(f.newName.lower() or f.name.lower(), f)
for f in funcs.values()]
for dummy, f in sortedFuncs:
# NOTE(review): `tocEntry` and `section` are not appended to `toc` /
# `sections` on any visible line - presumably those lines are missing.
tocEntry, section = formatFunction(f, sourceURL)
references = getReferences(source, f.name)
if references:
sections.append(referenceList(f, references))
print ""
print table(toc, 4)
print "
print "\n".join(sections)
print ""
def usage():
# Print a one-line usage message naming the script (sys.argv[0]) and exit
# by raising SystemExit.
# NOTE(review): the file-name placeholders after -d and -u seem to be
# missing from the message (per the header comment, -d takes one file and
# -u takes two) - confirm against the original.
print "Usage: %s (-d | -u )" % sys.argv[0]
raise SystemExit
# TODO: Make this method of Function?
def getArgsSig(f):
    """Return the argument "signature" of `f` as a list.

    Based on function/constructor args?

    * `f` - object with an `args` attribute holding a comma-separated
      argument list.

    Returns the pieces of `f.args` split on ",", or an empty list when
    `f.args` is empty.
    """
    if f.args:
        fargs = f.args.split(",")
    else:
        fargs = []
    return fargs
def findSignatureMatches(targetFunction, allFunctions):
    """Return the members of `allFunctions` that look like `targetFunction`.

    * `targetFunction` - the Function to find a counterpart for.
    * `allFunctions` - list of candidate Function instances.

    Candidates are narrowed step by step; each later step only runs while
    more than one candidate remains:

    1. identical method-name signature;
    2. same number of constructor/function arguments;
    3. a single close match on function source text;
    4. a single close match on the source of the first method.

    When nothing has the same signature, a single close (rather than
    exact) signature match is accepted instead.

    Returns a list holding zero, one or several candidates; callers treat
    exactly one as a confident match.

    NOTE(review): a few statements were missing from the copy this was
    restored from (the loop bodies in steps 1 and 2 and the candidate list
    passed to get_close_matches); they are reconstructed from the
    surrounding code - confirm against the original.
    """
    # TODO: Refactor copy & pasting...
    # TODO: Cache any of this stuff?

    # Method signatures
    sigMatches = [mf for mf in allFunctions
                  if mf.signature == targetFunction.signature]
    # This is strongest match so we not check anything else?

    # Constructor/function arg count signatures
    if len(sigMatches) > 1:
        targetArgCount = len(getArgsSig(targetFunction))
        # Build a new list rather than removing from the one being
        # iterated, which would skip candidates.
        sigMatches = [mf for mf in sigMatches
                      if len(getArgsSig(mf)) == targetArgCount]

    # Constructor/function source code match
    if len(sigMatches) > 1:
        matchSources = [mf.source for mf in sigMatches]
        closeSources = difflib.get_close_matches(targetFunction.source,
                                                 matchSources,
                                                 n = 2)
        if len(closeSources) == 1:
            sigMatches = [sigMatches[matchSources.index(closeSources[0])]]
        # TODO: Handle multiple closematches?

    # First method source code match
    if (len(sigMatches) > 1) and targetFunction.methods:
        # TODO: Check methods are present.
        # TODO: Check against all methods?
        matchSources = [mf.methods[0].source for mf in sigMatches]
        closeSources = difflib.get_close_matches(
            targetFunction.methods[0].source,
            matchSources,
            n = 2)
        if len(closeSources) == 1:
            sigMatches = [sigMatches[matchSources.index(closeSources[0])]]
        # TODO: Handle multiple close matches?

    # Close method signatures
    if not sigMatches:
        allSigs = [mf.signature for mf in allFunctions]
        closeSigs = difflib.get_close_matches(targetFunction.signature,
                                              allSigs,
                                              n = 2)
        if len(closeSigs) == 1:
            sigMatches = [allFunctions[allSigs.index(closeSigs[0])]]
        # TODO: Handle multiple close matches?

    return sigMatches
if __name__ == "__main__":
# Script entry point. argv[1] selects the mode ("-d" document, "-u"
# upgrade); argv[2] is the source file, and "-u" also needs argv[3], the
# newer source file.
# NOTE(review): the `except IndexError:` clauses below have no visible
# `try:` line or handler body; lines appear to be missing from this copy -
# presumably each handler called usage(). Confirm against the original.
option = sys.argv[1]
except IndexError:
if option == "-d":
sourceFilename1 = sys.argv[2]
except IndexError:
elif option =="-u":
sourceFilename1 = sys.argv[2]
sourceFilename2 = sys.argv[3]
except IndexError:
# Both modes start the same way: parse the first file and merge/rewrite its
# metadata file "funcs-<file>.txt".
source1 = open(sourceFilename1).read()
funcs1 = extractFunctionInfo(source1)
exportFuncs("funcs-%s.txt" % sourceFilename1, funcs1)
if option == "-d": # "d"ocument
# Write the line-numbered source page, then print the reference page
# (which links into it) to standard output.
linenumsFilename = "%s-linenums.html" % sourceFilename1
generateNumberedSource(sourceFilename1, linenumsFilename)
generateDocumentation(source1, funcs1, linenumsFilename)
elif option == "-u": # "u"pgrade
# Parse the newer file, then repeatedly try to pair each old function with
# exactly one new function until a full pass finds no further match.
source2 = open(sourceFilename2).read()
funcs2 = extractFunctionInfo(source2)
allFuncs1 = funcs1.values()
allFuncs2 = funcs2.values()
itemMatched = True
while allFuncs1 and allFuncs2 and itemMatched:
itemMatched = False
# Iterate over a copy of the old-function list.
for f in allFuncs1[:]:
sigMatches = findSignatureMatches(f, allFuncs2)
if len(sigMatches) == 1:
itemMatched = True
matchedF = sigMatches[0]
# NOTE(review): no visible line copies metadata onto matchedF or removes
# the matched pair from allFuncs1/allFuncs2; as shown the while loop would
# not end once a match is found - those lines are presumably missing.
# TODO: Allow for manual confirmation.
#print "\n------------------------------"
#print f.name, matchedF.name, f.newName
#print f.source
#print matchedF.source
if allFuncs1 and not allFuncs2:
print "%d items exist in old file but not in new file." % \
if allFuncs2 and not allFuncs1:
print "%d items exist in new file but not in old file." % \
if allFuncs1 and allFuncs2:
print "%d items in old file and "\
"%d items in new file not matched."% (len(allFuncs1),
exportFuncs("funcs-%s.txt" % sourceFilename2, funcs2)