Androguard的部分源碼(一)——androaxml.py

廢話少說,上代碼。


# Command-line option specifications. The 'name' entry holds the flag
# spellings; the remaining entries are keyword arguments for add_option().
option_0 = {
    'name': ('-i', '--input'),
    'help': "filename input (APK or android's binary xml)",
    'nargs': 1,
}
option_1 = {
    'name': ('-o', '--output'),
    'help': 'filename output of the xml',
    'nargs': 1,
}
option_2 = {
    'name': ('-v', '--version'),
    'help': 'version of the API',
    'action': 'count',
}
options = [option_0, option_1, option_2]


def main(options, arguments):
    """Decode an Android binary XML file, or an APK's manifest, to readable XML.

    The result is written to ``options.output`` when given, otherwise it is
    printed. When no input is given but the version flag is set, the
    Androguard version is printed instead.

    :param options: parsed command-line options; the ``input``, ``output``
        and ``version`` attributes are read
    :param arguments: remaining positional arguments (not used here)
    """
    if options.input is not None:
        # Inspect the file header to determine what kind of file this is.
        ret_type = androconf.is_android(options.input)
        if ret_type == "APK":
            a = apk.APK(options.input)
            manifest = a.get_android_manifest_xml()
            if manifest is None:
                # get_android_manifest_xml() returns None when the manifest
                # is absent or could not be parsed; report it instead of
                # failing with an AttributeError on None.
                print("Unable to parse AndroidManifest.xml")
                return
            buff = manifest.toprettyxml(encoding="utf-8")
        elif ".xml" in options.input:
            ap = apk.AXMLPrinter(read(options.input))
            buff = minidom.parseString(ap.get_buff()).toprettyxml(encoding="utf-8")
        else:
            print("Unknown file type")
            return

        if options.output is not None:
            # toprettyxml(encoding=...) already returns UTF-8 encoded bytes.
            # Write them in binary mode: pushing them through an encoding
            # text writer would try to re-encode them and fail on non-ASCII
            # content. The context manager closes the file even on error.
            with open(options.output, "wb") as fd:
                fd.write(buff)
        else:
            # No output file requested: dump to the screen.
            print(buff)

    elif options.version is not None:
        print("Androaxml version %s" % androconf.ANDROGUARD_VERSION)

if __name__ == "__main__":
    # Register every option specification with the parser: the 'name' entry
    # gives the positional flag spellings, the rest become keyword arguments.
    parser = OptionParser()
    for spec in options:
        flags = spec.pop('name')
        parser.add_option(*flags, **spec)

    options, arguments = parser.parse_args()
    # Replace argv in place with the leftover positional arguments.
    sys.argv[:] = arguments
    main(options, arguments)


這是androaxml.py的全部源碼。幾個內容

第一,參數。一個input,可以是apk,或者AndroidManifest.xml。一個output,這是指定的輸出文件名,如果不指定輸出文件名,則輸出到屏幕。

第二,如果爲apk,則使用APK()解析

    def get_android_manifest_xml(self):
        """
            Look up the parsed AndroidManifest.xml document.

            Returns whatever is stored under "AndroidManifest.xml" in
            ``self.xml``, or None when there is no such entry.

            :rtype: object
        """
        try:
            manifest = self.xml["AndroidManifest.xml"]
        except KeyError:
            manifest = None
        return manifest

如果是AndroidManifest.xml,則使用AXMLPrinter

而在APK.__init__函數中有這樣一段


        # Pick the zip backend according to zipmodule and open the raw APK
        # bytes with it.
        if zipmodule == 0:
            self.zip = ChilkatZip(self.__raw)
        elif zipmodule == 2:
            # Use the zipfile module shipped under androguard.patch.
            from androguard.patch import zipfile
            self.zip = zipfile.ZipFile(StringIO.StringIO(self.__raw), mode=mode)
        else:
            # Any other value falls back to a plain `import zipfile`
            # (presumably the standard-library module).
            import zipfile
            self.zip = zipfile.ZipFile(StringIO.StringIO(self.__raw), mode=mode)

        # Scan the archive members for the manifest and decode it.
        for i in self.zip.namelist():
            if i == "AndroidManifest.xml":
                # Keep the decoded binary XML printer for this entry ...
                self.axml[i] = AXMLPrinter(self.zip.read(i))
                try:
                    # ... and a DOM built from the text it produced.
                    self.xml[i] = minidom.parseString(self.axml[i].get_buff())
                except:
                    # NOTE(review): the bare except hides every failure; the
                    # entry is stored as None, so readers of self.xml must
                    # be prepared for a None document.
                    self.xml[i] = None

對apk文件利用ChilkatZip或者ZipFile進行解壓,然後從解壓後的文件列表當中遍歷獲取AndroidManifest.xml,再對AndroidManifest.xml調用AXMLPrinter,所以核心的處理在AXMLPrinter當中。

AXMLPrinter則是用AXMLParser對文件進行解析。

所以處理流程就清晰了

APK: 生成APK class實例 ——> 解壓文件 ——> 遍歷獲取AndroidManifest.xml ——> AXMLPrinter實例 ——> AXMLParser實例解析

XML: AXMLPrinter實例 ——> AXMLParser實例解析

class AXMLParser(object):
    """Parser over the raw bytes of an Android binary XML (AXML) file."""

    def __init__(self, raw_buff):
        """Wrap *raw_buff* and check its leading magic value.

        On a mismatch the parser is flagged invalid (``valid_axml`` is set
        to False) and a warning is emitted; no further state is set up.

        :param raw_buff: raw contents of the binary XML file
        """
        self.reset()

        self.valid_axml = True
        self.buff = bytecode.BuffHandle(raw_buff)

        # The first little-endian 32-bit word is the file header magic.
        (magic,) = unpack('<L', self.buff.read(4))

        if magic != CHUNK_AXML_FILE:
            self.valid_axml = False
            androconf.warning("Not a valid xml file")
            return

        # Skip the four bytes that follow the magic; their value is not used.
        self.buff.read(4)

        # String pool, read from the current buffer position.
        self.sb = StringBlock(self.buff)

        # Containers filled in later while chunks are consumed.
        self.m_resourceIDs = []
        self.m_prefixuri = {}
        self.m_uriprefix = {}
        self.m_prefixuriL = []

        self.visited_ns = []
AXMLParser.buff結構


self.__buff保存內容

self.__idx保存已解析的長度,也就是下次解析的起點

class AXMLPrinter(object):
    """Turn the events of an AXMLParser into XML text held in ``self.buff``."""

    def __init__(self, raw_buff):
        """Parse *raw_buff* and build the textual XML.

        :param raw_buff: raw binary XML contents, handed to AXMLParser
        """
        self.axml = AXMLParser(raw_buff)
        self.xmlns = False

        self.buff = u''

        # Main loop: pull one event per iteration until the parser reports
        # END_DOCUMENT or stops being valid.
        while self.axml.is_valid():
            event = self.axml.next()

            if event == START_DOCUMENT:
                self.buff += u'<?xml version="1.0" encoding="utf-8"?>\n'

            elif event == START_TAG:
                # Opening tag name, then namespace declarations, then one
                # line per attribute, then the closing bracket.
                tag_prefix = self.getPrefix(self.axml.getPrefix())
                self.buff += u'<' + tag_prefix + self.axml.getName() + u'\n'
                self.buff += self.axml.getXMLNS()

                for idx in range(self.axml.getAttributeCount()):
                    attr_prefix = self.getPrefix(self.axml.getAttributePrefix(idx))
                    attr_name = self.axml.getAttributeName(idx)
                    attr_value = self._escape(self.getAttributeValue(idx))
                    self.buff += "%s%s=\"%s\"\n" % (attr_prefix, attr_name, attr_value)

                self.buff += u'>\n'

            elif event == END_TAG:
                tag_prefix = self.getPrefix(self.axml.getPrefix())
                tag_name = self.axml.getName()
                self.buff += "</%s%s>\n" % (tag_prefix, tag_name)

            elif event == TEXT:
                self.buff += "%s\n" % self.axml.getText()

            elif event == END_DOCUMENT:
                break
AXMLParser實例化完成後進入主處理邏輯

在前一篇文章《反編譯編譯後的AndroidManifest》當中也有一段類似的處理邏輯,實現大同小異,都是讀取tag,判斷是什麼chunk,然後處理,可以對比一下。

    def next(self):
        """Advance the parser by one step and report the resulting event.

        Runs ``doNext()`` and then returns the value left in
        ``self.m_event``.
        """
        self.doNext()
        return self.m_event
next函數調用doNext

    def doNext(self):
        """Consume chunks from ``self.buff`` until the next event is known.

        The outcome is stored in ``self.m_event`` (START_DOCUMENT, START_TAG,
        END_TAG, TEXT or END_DOCUMENT); nothing is returned. Resource-ID and
        namespace chunks are absorbed into parser state and do not produce an
        event of their own, so the loop simply continues past them.
        """
        if self.m_event == END_DOCUMENT:    # document already finished; nothing more to read
            return

        # Remember the previous event before reset() runs.
        # NOTE(review): reset() is defined elsewhere; presumably it clears
        # m_event and the per-tag fields - confirm.
        event = self.m_event

        self.reset()
        while True:
            chunkType = -1

            # Fake END_DOCUMENT event.
            if event == END_TAG:            # previous event was an end tag; placeholder, no action taken
                pass

            # START_DOCUMENT
            if event == START_DOCUMENT:     # the start-tag chunk type was already read on the call that
                                            # produced START_DOCUMENT, so it is not read again here
                chunkType = CHUNK_XML_START_TAG
            else:
                if self.buff.end():         # no more data: the document is over
                    self.m_event = END_DOCUMENT
                    break
                chunkType = unpack('<L', self.buff.read(4))[0] # next four bytes: the chunk type

            if chunkType == CHUNK_RESOURCEIDS:                  # resource-ID chunk
                chunkSize = unpack('<L', self.buff.read(4))[0]
                # FIXME
                if chunkSize < 8 or chunkSize % 4 != 0:         # sanity check on the size; only warns
                    androconf.warning("Invalid chunk size")

                # One 32-bit ID per remaining word; the two words already
                # read (type and size) are subtracted from the count.
                # NOTE(review): relies on Python 2 integer division.
                for i in range(0, chunkSize / 4 - 2):
                    self.m_resourceIDs.append(unpack('<L', self.buff.read(4))[0])

                continue

            # FIXME
            if chunkType < CHUNK_XML_FIRST or chunkType > CHUNK_XML_LAST:   # unrecognised chunk type; only warns
                androconf.warning("invalid chunk type")

            # Fake START_DOCUMENT event.
            if chunkType == CHUNK_XML_START_TAG and event == -1:    # first start tag seen while the previous event is still -1
                self.m_event = START_DOCUMENT                       # report START_DOCUMENT instead of parsing the tag now
                break                                               # hand control back to the caller's main loop

            self.buff.read(4)  # /*chunkSize*/
            lineNumber = unpack('<L', self.buff.read(4))[0]
            self.buff.read(4)  # 0xFFFFFFFF

            if chunkType == CHUNK_XML_START_NAMESPACE or chunkType == CHUNK_XML_END_NAMESPACE:
                if chunkType == CHUNK_XML_START_NAMESPACE:                  # start-namespace chunk
                    prefix = unpack('<L', self.buff.read(4))[0]
                    uri = unpack('<L', self.buff.read(4))[0]

                    # Record the mapping in both directions and push it on
                    # the list used as a stack of open namespaces.
                    self.m_prefixuri[prefix] = uri
                    self.m_uriprefix[uri] = prefix
                    self.m_prefixuriL.append((prefix, uri))
                    self.ns = uri
                else:                                                       # end-namespace chunk
                    self.ns = -1
                    # Skip the two words of the chunk body without using them.
                    self.buff.read(4)
                    self.buff.read(4)
                    # Pop the most recently opened namespace; the popped
                    # values are not used and the lookup dicts are kept.
                    (prefix, uri) = self.m_prefixuriL.pop()
                    #del self.m_prefixuri[ prefix ]
                    #del self.m_uriprefix[ uri ]

                continue

            self.m_lineNumber = lineNumber

            if chunkType == CHUNK_XML_START_TAG:    # a start tag reached when the previous event is not -1: parse it
                self.m_namespaceUri = unpack('<L', self.buff.read(4))[0]
                self.m_name = unpack('<L', self.buff.read(4))[0]

                # FIXME
                self.buff.read(4)  # flags

                # One word packs two fields: the upper 16 bits (minus one)
                # go to m_idAttribute, the lower 16 bits are the count.
                attributeCount = unpack('<L', self.buff.read(4))[0]
                self.m_idAttribute = (attributeCount >> 16) - 1
                attributeCount = attributeCount & 0xFFFF
                # Same packing for the next word: upper half -> style,
                # lower half -> class, each stored minus one.
                self.m_classAttribute = unpack('<L', self.buff.read(4))[0]
                self.m_styleAttribute = (self.m_classAttribute >> 16) - 1

                self.m_classAttribute = (self.m_classAttribute & 0xFFFF) - 1

                # Each attribute occupies ATTRIBUTE_LENGHT consecutive words.
                for i in range(0, attributeCount * ATTRIBUTE_LENGHT):
                    self.m_attributes.append(unpack('<L', self.buff.read(4))[0])

                # Keep only the top byte of every attribute's value-type word.
                for i in range(ATTRIBUTE_IX_VALUE_TYPE, len(self.m_attributes), ATTRIBUTE_LENGHT):
                    self.m_attributes[i] = self.m_attributes[i] >> 24

                self.m_event = START_TAG
                break

            if chunkType == CHUNK_XML_END_TAG:
                self.m_namespaceUri = unpack('<L', self.buff.read(4))[0]
                self.m_name = unpack('<L', self.buff.read(4))[0]
                self.m_event = END_TAG
                break

            if chunkType == CHUNK_XML_TEXT:
                self.m_name = unpack('<L', self.buff.read(4))[0]

                # FIXME
                # Two further words are skipped without being interpreted.
                self.buff.read(4)
                self.buff.read(4)

                self.m_event = TEXT
                break

doNext函數很長。關注的重點在while循環中。當讀取到ResourceId Chunk和Namespace Chunk

則continue。而第一次讀到Start Tag Chunk的時候則會退出,返回到AXMLPrinter的主處理邏輯當中。

再看一下AXMLPrinter

        # Main loop: request one event per iteration and append the matching
        # XML text to self.buff; stops at END_DOCUMENT or once the parser
        # reports it is not valid.
        while True and self.axml.is_valid():
            _type = self.axml.next()
#           print "tagtype = ", _type

            if _type == START_DOCUMENT:
                # Emit the XML declaration once at the start of the document.
                self.buff += u'<?xml version="1.0" encoding="utf-8"?>\n'
            elif _type == START_TAG:
                # Opening tag: name, namespace declarations, then one line
                # per attribute, closed by '>'.
                self.buff += u'<' + self.getPrefix(self.axml.getPrefix()) + self.axml.getName() + u'\n'
                self.buff += self.axml.getXMLNS()

                for i in range(0, self.axml.getAttributeCount()):
                    self.buff += "%s%s=\"%s\"\n" % (self.getPrefix(
                        self.axml.getAttributePrefix(i)), self.axml.getAttributeName(i), self._escape(self.getAttributeValue(i)))

                self.buff += u'>\n'

            elif _type == END_TAG:
                # Closing tag with the same prefix/name lookup as the opener.
                self.buff += "</%s%s>\n" % (self.getPrefix(self.axml.getPrefix()), self.axml.getName())

            elif _type == TEXT:
                # Text node: appended on its own line.
                self.buff += "%s\n" % self.axml.getText()

            elif _type == END_DOCUMENT:
                break
self.buff是準備寫入解析後的xml文件的字符串。第一次執行next函數,成功讀取ResourceId Chunk和Namespace Chunk

之後遇到Start Tag Chunk,修改m_event之後退出。在buff字符串寫入u'<?xml version="1.0" encoding="utf-8"?>\n'
然後繼續執行next函數,之後再遇到Start Tag Chunk時不會直接退出,而是執行相應的解析操作。

從理論上講,字符串池、ResourceId Chunk和Namespace Chunk都位於第一個Start Tag Chunk之前,

而這些chunk也並不會直接出現在解析後的xml文件中。

所以先將他們解析,放入準備好的變量容器。第一次遇到Start Tag Chunk說明之前的內容已經處理完畢了,之後就可以將

Start Tag Chunk解析後的結果寫入結果字符串。


當結果字符串構造完畢之後,輸出到文件或者屏幕即可。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章