from forensics.object2 import Object

# Structure layouts reverse engineered from mshtml.dll
# Version: 7.0.6000.16809
#
# Every entry has the shape
#     'StructName': [size, {'member': [offset, [type, ...]]}]
# (the leading number is presumably the structure size in bytes -- confirm
# against forensics.object2).
# NOTE(review): CMarkup declares 0x58 but lists members at 0x5c and 0x90,
# and CTreePos declares 0x10 but lists txtFlags at offset 0x10 -- confirm
# that the declared sizes are intentional before relying on them.
mshtml_types = {
    'CDoc': [0x484, {
        'Lookaside': [0x68, ['CHtPvPv']],
        'pWindowProxy': [0x168, ['pointer', ['COmWindowProxy']]],
        'flags': [0x480, ['unsigned long']],
    }],
    'CHtPvPv': [0x18, {
        'pHashTable': [0x0, ['pointer', ['_HTENTRY']]],
        'numElements': [0x14, ['unsigned long']],
    }],
    '_HTENTRY': [0x8, {
        'hkey': [0x0, ['unsigned long']],
        'hvalue': [0x4, ['unsigned long']],
    }],
    'COmWindowProxy': [0x34, {
        'pWindow': [0x30, ['pointer', ['CWindow']]],
    }],
    'CWindow': [0x30, {
        'pMarkup': [0x24, ['pointer', ['CMarkup']]],
        'uri': [0x2c, ['pointer', ['CDocument']]],
    }],
    'CDocument': [0x18, {
        'pWindow': [0x14, ['pointer', ['CWindow']]],
    }],
    'CMarkup': [0x58, {
        'txtArray': [0x28, ['CArrayBase']],
        'pvBitmap': [0x90, ['unsigned long']],
        'pSecurity': [0x54, ['pointer', ['CSecurityContext']]],
        'rootTreePos': [0x5c, ['CTreePos']],
    }],
    'CSecurityContext': [0x10, {
        'pDoc': [0xC, ['pointer', ['CDoc']]],
    }],
    'CTreeNode': [0x30, {
        'pElement': [0x0, ['pointer', ['CElement']]],
        'parent': [0x4, ['pointer', ['CTreeNode']]],
        'tagIdx': [0x8, ['unsigned char']],
        'tpBegin': [0x10, ['CTreePos']],
        'tpEnd': [0x20, ['CTreePos']],
    }],
    'CTreePos': [0x10, {
        'flags': [0x0, ['unsigned long']],
        # left subtree character count?
        'nchLeft': [0x4, ['unsigned long']],
        'pChild': [0x8, ['pointer', ['CTreePos']]],
        'pNext': [0xC, ['pointer', ['CTreePos']]],
        # txtFlags & 0x3FFFFFF = number of characters
        'txtFlags': [0x10, ['unsigned long']],
    }],
    'CElement': [0x24, {
        'hasAAIndex': [0xC, ['unsigned long']],
        'pTreeNode': [0x10, ['pointer', ['CTreeNode']]],
        'tagDescIdx': [0x14, ['unsigned char']],
        'flags': [0x18, ['unsigned long']],
        'pMarkup': [0x20, ['pointer', ['CMarkup']]],
    }],
    'CElemCache': [0x10, {
        'pHtml': [0x0, ['pointer', ['CElement']]],
        'pHead': [0x4, ['pointer', ['CElement']]],
        'pTitle': [0x8, ['pointer', ['CElement']]],
        'pBody': [0xC, ['pointer', ['CElement']]],
    }],
    'CArrayBase': [0x10, {
        'pStart': [0x0, ['pointer', ['void']]],
        'numElements': [0x4, ['unsigned long']],
        'elemSize': [0xC, ['unsigned long']],
    }],
    'CTextArrayElem': [0x10, {
        'nChars': [0x0, ['unsigned long']],
        'pBuff': [0x4, ['pointer', ['void']]],
        'gapOff': [0x8, ['unsigned long']],
        'bufLen': [0xC, ['unsigned long']],
    }],
}

# Bit values for CTreePos.flags, one bit per name (bit 0 through bit 7).
TP_FLAGS = dict(
    TP_BEGIN=1 << 0,
    TP_END=1 << 1,
    TP_TEXT=1 << 2,
    TP_PTR=1 << 3,
    TP_FIRST=1 << 4,
    TP_LAST=1 << 5,
    TP_TXTINFO=1 << 6,
    TP_DATA=1 << 7,
)

# Tag names used by IE, paired with each tag's flags dword.
# This is the table CElement.tagDescIdx refers to: the position of an entry
# in the list is its index, so entries must never be reordered, removed or
# de-duplicated (some names, e.g. NOEMBED and TITLE, legitimately appear
# twice with different flags).
#
# Created using Windbg. Each mshtml!g_atagdesc entry is 16 bytes (index << 4);
# the name pointer is read from offset 0 and the flags dword from offset 0xC.
# Windbg's default number base is hexadecimal, so the loop bound 89 below
# is 0x89 = 137, which matches the number of entries in this list.
#  Names:
#   .for ( r $t0 = 0 ; @$t0 < 89 ; r $t0 = @$t0 + 1) {
#       du poi(mshtml!g_atagdesc+(@$t0 << 4))
#   }
#  Flags:
#   .for ( r $t0 = 0 ; @$t0 < 89 ; r $t0 = @$t0 + 1) {
#       dd mshtml!g_atagdesc+(@$t0 << 4)+C L1
#   }
TAGDESC = [
    ("", 0x00000001),  # index 0
    ("", 0x20000001),
    ("A", 0x00000000),
    ("ABBR", 0x00000020),
    ("ACRONYM", 0x00000020),
    ("ADDRESS", 0x00080042),
    ("APPLET", 0x08010001),
    ("AREA", 0x00000001),
    ("B", 0x00000020),
    ("BASE", 0x00410000),
    ("BASEFONT", 0x00000000),
    ("BDO", 0x00000020),
    ("BGSOUND", 0x00010001),
    ("BIG", 0x00000020),
    ("BLINK", 0x00000020),
    ("BLOCKQUOTE", 0x008c0022),
    ("BODY", 0x22191802),  # index 16 (0x10)
    ("BR", 0x00000001),
    ("BUTTON", 0x02081800),
    ("CAPTION", 0x00080802),
    ("CENTER", 0x00080002),
    ("CITE", 0x00000020),
    ("CODE", 0x00000020),
    ("COL", 0x00080003),
    ("COLGROUP", 0x00080002),
    ("COMMENT", 0x0000c001),
    ("DD", 0x0008004a),
    ("", 0x00000001),
    ("DEL", 0x00000020),
    ("DFN", 0x00000020),
    ("DIR", 0x00080046),
    ("DIV", 0x00880082),
    ("DL", 0x000c0046),  # index 32 (0x20)
    ("DT", 0x0008004a),
    ("EM", 0x00000020),
    ("EMBED", 0x00000001),
    ("FIELDSET", 0x00880882),
    ("FONT", 0x00000020),
    ("FORM", 0x20080002),
    ("FRAME", 0x02010001),
    ("FRAMESET", 0x20010000),
    ("H1", 0x008800d2),
    ("H2", 0x008800d2),
    ("H3", 0x008800d2),
    ("H4", 0x008800d2),
    ("H5", 0x008800d2),
    ("H6", 0x008800d2),
    ("HEAD", 0x00100000),
    ("HR", 0x00000401),  # index 48 (0x30)
    ("HTML", 0x60110000),
    ("I", 0x00000020),
    ("IFRAME", 0x00110001),
    ("IMG", 0x00010001),
    ("INPUT", 0x03011001),
    ("INS", 0x00000020),
    ("ISINDEX", 0x20000001),
    ("KBD", 0x00000020),
    ("LABEL", 0x00000000),
    ("LEGEND", 0x02080c00),
    ("LI", 0x0000000a),
    ("LINK", 0x14010001),
    ("LISTING", 0x00180002),
    ("MAP", 0x00040000),
    ("MARQUEE", 0x02080802),
    ("MENU", 0x000c0046),  # index 64 (0x40)
    ("META", 0x00210001),
    ("NEXTID", 0x00000001),
    ("NOBR", 0x00000020),
    ("NOEMBED", 0x20010001),
    ("NOEMBED", 0x00000000),
    ("NOFRAMES", 0x00010001),
    ("NOFRAMES", 0x00000000),
    ("NOSCRIPT", 0x20010001),
    ("NOSCRIPT", 0x00000000),
    ("OBJECT", 0x09090001),
    ("OL", 0x00040046),
    ("OPTION", 0x00000000),
    ("P", 0x000800c2),
    ("PARAM", 0x00000001),
    ("PLAINTEXT", 0x0009c102),
    ("PRE", 0x00180042),  # index 80 (0x50)
    ("Q", 0x00000020),
    ("", 0x00000802),
    ("RP", 0x00000020),
    ("RT", 0x00000020),
    ("RUBY", 0x00000020),
    ("S", 0x00000020),
    ("SAMP", 0x00000020),
    ("SCRIPT", 0x0811c001),
    ("SELECT", 0x01040000),
    ("SMALL", 0x00000020),
    ("SPAN", 0x00000000),
    ("STRIKE", 0x00000020),
    ("STRONG", 0x00000020),
    ("STYLE", 0x0010c001),
    ("SUB", 0x00000020),
    ("SUP", 0x00000020),  # index 96 (0x60)
    ("TABLE", 0x000d0202),
    ("TBODY", 0x00080002),
    ("", 0x00080802),
    ("TD", 0x000c0802),
    ("TEXTAREA", 0x031c5000),
    ("TFOOT", 0x00080002),
    ("TH", 0x000c0802),
    ("THEAD", 0x00080002),
    ("TITLE", 0x00106000),
    ("TR", 0x000c0002),
    ("TT", 0x00000020),
    ("U", 0x00000020),
    ("UL", 0x00040046),
    ("VAR", 0x00000020),
    ("WBR", 0x00000001),
    ("XMP", 0x0018c002),  # index 112 (0x70)
    ("", 0x00000000),
    ("", 0x0000c001),
    ("", 0x0000c001),
    ("", 0x00000000),
    ("TITLE", 0x00104001),
    ("OPTGROUP", 0x00040000),
    ("!", 0x00000001),
    ("", 0x00000001),
    ("", 0x00000001),
    ("", 0x00000001),
    ("", 0x28000001),
    ("", 0x20000001),
    ("", 0x20000001),
    ("", 0x20000001),
    ("", 0x20000001),
    ("", 0x20000001),  # index 128 (0x80)
    ("", 0x20010001),
    ("", 0x20010001),
    ("", 0x00000000),
    ("", 0x24010001),
    ("", 0x20010001),
    ("", 0x20010001),
    ("", 0x00000001),
    ("", 0x20010001),  # index 136 (0x88), last entry
]

# Per-tag end-tag table: 137 entries, the same length as TAGDESC.
# NOTE(review): presumably indexed by the same tag index as TAGDESC
# (CElement.tagDescIdx) and consulted when deciding whether to emit a closing
# tag -- the consumer is not defined in this part of the file, so confirm.
# Only the values 0, 1 and 2 occur. Position-for-position, 0 lines up with
# TAGDESC names such as AREA, BR, HR, IMG and INPUT; what distinguishes 1
# from 2 is not evident from here.
# The list is positional: never reorder or drop entries.
HASEND = [
    0,  # index 0
    0,
    1,
    1,
    1,
    2,
    2,
    0,
    1,
    0,
    1,
    1,
    0,
    1,
    1,
    2,
    2,  # index 16 (0x10)
    0,
    2,
    2,
    1,
    1,
    1,
    0,
    2,
    2,
    2,
    2,
    1,
    1,
    2,
    2,
    2,  # index 32 (0x20)
    2,
    1,
    0,
    2,
    1,
    1,
    0,
    2,
    1,
    1,
    1,
    1,
    1,
    1,
    2,
    0,  # index 48 (0x30)
    2,
    1,
    2,
    0,
    0,
    1,
    0,
    1,
    1,
    2,
    2,
    0,
    2,
    1,
    2,
    2,  # index 64 (0x40)
    0,
    0,
    1,
    2,
    1,
    2,
    1,
    2,
    1,
    2,
    2,
    2,
    2,
    0,
    2,
    2,  # index 80 (0x50)
    1,
    2,
    1,
    1,
    1,
    1,
    1,
    2,
    2,
    1,
    1,
    1,
    1,
    2,
    1,
    1,  # index 96 (0x60)
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    1,
    1,
    2,
    1,
    0,
    2,  # index 112 (0x70)
    1,
    2,
    2,
    1,
    2,
    2,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,  # index 128 (0x80)
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,  # index 136 (0x88), last entry
]

# Named bit masks, generally recovered by reading the CElement::Is*Tag
# functions (presumably applied to the flags dword of a TAGDESC entry --
# confirm against the code that uses them).
# Note: the list is incomplete; many flags are still missing.
TAGDESC_FLAGS = dict(
    BLOCK_TAG=0x00000002,
    NEEDSNBSP=0x00080000,
    HASNOEND=0x00400000,  # From TagHasNoEndTag
)

def get_html(iesrv):
    """Rebuild an HTML string by walking a document's tree positions.

    A CDoc object is instantiated at ``iesrv.dwUserData`` in the address
    space of the owning process, its markup is reached through
    ``pWindowProxy.pWindow.pMarkup``, and the walk starts at the begin
    position of the cached HTML element, following ``next_treepos()``
    until a false value is returned.

    ``iesrv`` must expose ``Win32Thread.pEThread.ThreadsProcess.vm``,
    ``dwUserData`` and ``profile`` (presumably the window object of an
    Internet Explorer server window -- confirm with the caller).

    Returns the accumulated markup as a string: element tags as produced
    by ``elem_tag()`` and text runs passed through ``cgi.escape``.
    """
    from cStringIO import StringIO
    import cgi # For escaping HTML

    address_space = iesrv.Win32Thread.pEThread.ThreadsProcess.vm
    document = Object("CDoc", iesrv.dwUserData, address_space,
                      profile=iesrv.profile)

    # Try to get the "top" element
    markup = document.pWindowProxy.pWindow.pMarkup
    elem_cache = markup.elem_cache()

    out = StringIO()
    emit = out.write
    end_mask = TP_FLAGS['TP_END']

    # True once text has been written since the most recent begin tag.
    wrote_text = False
    pos = elem_cache.pHtml.pTreeNode.tpBegin
    while pos:
        if pos.has_elem():
            element = pos.get_treenode().pElement
            if not (pos.flags & end_mask):
                # Begin position: always emit the tag and start tracking
                # whether this element receives any text.
                emit(pos.elem_tag())
                wrote_text = False
            else:
                # End position: pad elements that need a non-breaking space
                # when no text was written, then close the tag if it has an
                # end tag, and break the line after block elements.
                if element.needs_nbsp() and not wrote_text:
                    emit("&nbsp;")
                if element.has_end():
                    emit(pos.elem_tag())
                if element.is_block():
                    emit("\n")
        elif pos.is_text():
            emit(cgi.escape(markup.get_text(pos)))
            wrote_text = True
        elif pos.is_ptr():
            # Pointer positions contribute nothing to the output; any other
            # kind of position is likewise ignored.
            pass
        pos = pos.next_treepos()

    return out.getvalue()
