1G多的XML轉檔

問題:
   存在一個1G多的XML文件,需要按照一定的邏輯提取其中的數據,做一定的格式化和業務邏輯處理,最後生成3個不同的文本文件。

解決:
   做XML的轉換,有如下的幾個選擇:1. DOM + XSL;2. DOM + 自己解析;3. SAX + 自己解析。因爲擔心performance的問題,所以決定採用SAX + Stack的方式來處理。因爲要生成3個文件,所以設計3個類來負責分別產生3個文件。然後通過鏈表的方式串接起來,讓他們截獲自己concern的內容,做進一步的處理。最後各自產生自己負責的文件。


其中:
XmlTextReader : .NET提供的前向唯讀(forward-only)串流式XML Reader,用法類似SAX,不會把整個文件載入內存。他會逐一讀到XML的指令,COMMENT,開始TAG,文本,結束TAG等內容

Processor : 解析XML,產生文本文件的處理器接口。裏面定義了5個方法。他們分別是,open():用來打開文件;close():用來關閉文件;startTag():當XmlTextReader讀到開始TAG的時候,就會呼叫Processor的這個方法,將TAG名字,Tag的Stack,Value的Stack作爲參數傳入;endTag():當XmlTextReader讀到結束TAG的時候,就會呼叫Processor的這個方法,將TAG名字,Tag的Stack,Value的Stack作爲參數傳入;text():當XmlTextReader讀到文本內容的時候,就會呼叫Processor的這個方法,將文本值,Tag的Stack,Value的Stack作爲參數傳入

Executor : 作爲XmlTextReader和各種Processor交互的橋樑。在他的內部,各種不同的Processor會以鏈表的形式組合在一塊兒。當XmlTextReader讀到開始Tag,文本,結束Tag的時候,就會去invoke Executor的相應方法,而Executor則會逐一invoke各種Processor的相應方法,若其中有一個Processor處理了某個請求,則終止向後續傳。最後一個DefaultProcessor,他的存在只是爲了保證tagStack,valueStack的完整性而已。

TProcessor,DProcessor,PProcessor : 負責接收只有自己感興趣的xml內容,然後轉交給TagWrapper,做格式化,業務邏輯處理而已。在startTag(),text()方法裏面,處理邏輯是,如果是自己關心的內容,則直接入棧;在endTag()方法裏面,處理邏輯是,如果是自己關心的內容,則判斷是否到了一條記錄生成完成的時候(根據endTag標誌),如果是則將記錄寫入到文件當中,否則將startTag出棧,valueTag出棧,將value,tag名字交給TagWrapper鏈表,做內部的格式化和業務邏輯處理。

程序的流程如下:


部分程序代碼(VB.NET)

Main的部分代碼
' Driver: streams the XML file node by node and forwards element starts,
' element ends and text nodes to the Executor, which dispatches them to the
' individual processors. The two stacks are shared by all processors.
Dim m_xmlr As XmlTextReader = Nothing

Dim tagStack As New Stack

Dim valueStack As New Stack

Dim executor As Processor = New Executor("c:\TOTFA", "c:\DTLFA", "c:\ORDFA")
Try
     executor.open()
     m_xmlr = New XmlTextReader("c:\test2.xml")
     m_xmlr.WhitespaceHandling = WhitespaceHandling.None
     While m_xmlr.Read()
          ' Declarations, processing instructions, comments and every other
          ' node type are intentionally ignored.
          Select Case m_xmlr.NodeType
               Case XmlNodeType.Element
                    ' Capture both values before dispatching: a self-closing
                    ' element (<x/>) is reported as Element only, with no
                    ' matching EndElement node.
                    Dim elementName As String = m_xmlr.Name
                    Dim selfClosing As Boolean = m_xmlr.IsEmptyElement
                    executor.startTag(elementName, tagStack, valueStack)
                    If selfClosing Then
                         ' Synthesize the end event so <x/> is handled exactly
                         ' like <x></x> and the tag stack stays balanced.
                         executor.endTag(elementName, tagStack, valueStack)
                    End If

               Case XmlNodeType.EndElement
                    executor.endTag(m_xmlr.Name, tagStack, valueStack)

               Case XmlNodeType.Text
                    executor.text(m_xmlr.Value, tagStack, valueStack)
          End Select
     End While
Finally
     ' Release the input file and the output files even when parsing fails.
     Try
          If m_xmlr IsNot Nothing Then
               m_xmlr.Close()
          End If
     Finally
          executor.close()
     End Try
End Try

 

Executor的代碼
' Bridge between the XML reader and the concrete processors.
' Holds the processors in a fixed order and offers every event to them in
' turn; the first processor whose handler returns True stops the dispatch.
Public Class Executor
     Implements Processor

     ' Dispatch order: TProcessor, DProcessor, PProcessor, DefaultProcessor.
     Private ReadOnly list As New List(Of Processor)

     ' Builds the processor chain.
     '   TOTFA / DTLFA / ORDFA - values handed to TProcessor, DProcessor and
     '   PProcessor respectively (TProcessor uses its value as the output
     '   file name; presumably the others do too - confirm in their sources).
     Public Sub New(ByVal TOTFA As String, ByVal DTLFA As String, ByVal ORDFA As String)
          Dim tp As TProcessor = New TProcessor(TOTFA)
          Dim dp As DProcessor = New DProcessor(DTLFA, tp)
          Dim pp As PProcessor = New PProcessor(ORDFA, tp, dp)
          list.Add(tp)
          list.Add(dp)
          list.Add(pp)
          ' Must stay last so the specialised processors are asked first.
          list.Add(New DefaultProcessor())
     End Sub

     ' Offers a start-tag event to each processor until one returns True.
     ' Always returns True, including when no processor claimed the event.
     Function startTag(ByVal tagName As String, ByRef tagStack As Stack, ByRef valueStack As Stack) As Boolean Implements Processor.startTag
          For Each handler As Processor In list
               If handler.startTag(tagName, tagStack, valueStack) Then
                    Return True
               End If
          Next
          Return True
     End Function

     ' Offers an end-tag event to each processor until one returns True.
     ' Always returns True, including when no processor claimed the event.
     Function endTag(ByVal tagName As String, ByRef tagStack As Stack, ByRef valueStack As Stack) As Boolean Implements Processor.endTag
          For Each handler As Processor In list
               If handler.endTag(tagName, tagStack, valueStack) Then
                    Return True
               End If
          Next
          Return True
     End Function

     ' Offers a text event to each processor until one returns True.
     ' Always returns True, including when no processor claimed the event.
     Function text(ByVal value As String, ByRef tagStack As Stack, ByRef valueStack As Stack) As Boolean Implements Processor.text
          For Each handler As Processor In list
               If handler.text(value, tagStack, valueStack) Then
                    Return True
               End If
          Next
          Return True
     End Function

     ' Opens every processor, in chain order.
     Sub open() Implements Processor.open
          For Each handler As Processor In list
               handler.open()
          Next
     End Sub

     ' Closes every processor, in chain order. A failure in one processor no
     ' longer prevents the remaining ones from being closed; the first
     ' failure is rethrown after all of them have been attempted.
     Sub close() Implements Processor.close
          Dim firstFailure As Exception = Nothing
          For Each handler As Processor In list
               Try
                    handler.close()
               Catch ex As Exception
                    If firstFailure Is Nothing Then
                         firstFailure = ex
                    End If
               End Try
          Next
          If firstFailure IsNot Nothing Then
               Throw firstFailure
          End If
     End Sub
End Class

 

TProcessor代碼
Imports System.IO
Imports System.Text
Imports System.Text.RegularExpressions

' Processor for the "tdata" record and its "t<number>" field elements.
' Field values are handed to the matching TagWrapper; when the enclosing
' "tdata" element ends, the formatted values of all wrappers are written to
' the output file as one record.
Public Class TProcessor
     Implements Processor

     ' Name of the element whose end tag completes one record.
     Private Const RecordTag As String = "tdata"

     ' Matches exactly "tdata" or "t" followed by digits. Anchored so that
     ' names merely containing such a fragment (e.g. "amount1", "metadata")
     ' are left for the processors further down the chain.
     Private Shared ReadOnly OwnedTag As New Regex("\A(?:tdata|t\d+)\z")

     ' One wrapper per output field, in output order.
     Private tagWrapperList As New List(Of TagWrapper)

     Private sw As StreamWriter

     Private filename As String

     ' filename - path of the output file created by open().
     ' The TagWrapper arguments are presumably: tag name, field description,
     ' default value, width, an offset/scale and the formatter -
     ' NOTE(review): confirm against the TagWrapper constructor.
     Public Sub New(ByVal filename As String)
          Me.filename = filename
          tagWrapperList.Add(New TagWrapper("t1", "欄位說明", " ", 2, 0, FormatterFactory.getInstance().getRBlankFormatter()))
          tagWrapperList.Add(New TagWrapper("t2", "欄位說明", " ", 10, 0, FormatterFactory.getInstance().getRBlankFormatter()))
          tagWrapperList.Add(New TagWrapper("t3", "欄位說明", "00000", 5, 0, FormatterFactory.getInstance().getYearMonthFormatter()))
          tagWrapperList.Add(New TagWrapper("t4", "欄位說明", " ", 1, 0, FormatterFactory.getInstance().getRBlankFormatter()))
          tagWrapperList.Add(New TagWrapper("t5", "欄位說明", " ", 1, 0, FormatterFactory.getInstance().getRBlankFormatter()))
          tagWrapperList.Add(New TagWrapper("t6", "欄位說明", "0000000", 7, 0, FormatterFactory.getInstance().getYearMonthDateFormatter()))
          tagWrapperList.Add(New TagWrapper("t7", "欄位說明", "000000", 6, 0, FormatterFactory.getInstance().getRZeroFormatter()))
          tagWrapperList.Add(New TagWrapper("t8", "欄位說明", "0000000000", 10, 0, FormatterFactory.getInstance().getRZeroFormatter()))
          tagWrapperList.Add(New TagWrapper("t9", "欄位說明", "000000", 6, 0, FormatterFactory.getInstance().getRZeroFormatter()))
          tagWrapperList.Add(New TagWrapper("t10", "欄位說明", "0000000000", 10, 0, FormatterFactory.getInstance().getRZeroFormatter()))
          tagWrapperList.Add(New TagWrapper("t11", "欄位說明", "000000", 6, 0, FormatterFactory.getInstance().getRZeroFormatter()))
          tagWrapperList.Add(New TagWrapper("t12", "欄位說明", "0000000000", 10, 0, FormatterFactory.getInstance().getRZeroFormatter()))
          tagWrapperList.Add(New TagWrapper("t13", "欄位說明", "000000", 6, 0, FormatterFactory.getInstance().getRZeroFormatter()))
     End Sub

     ' Closes the output file if it is open. Safe to call more than once.
     Public Sub close() Implements Processor.close
          If sw IsNot Nothing Then
               sw.Close()
               sw = Nothing
          End If
     End Sub

     ' Handles the end of an element.
     ' Returns False (event not handled) unless tagName is one of this
     ' processor's tags AND equals the tag on top of tagStack.
     ' Otherwise the tag is popped and True is returned:
     '   - "tdata": the completed record is written to the file;
     '   - field tag: the pending value (if any) is popped from valueStack
     '     and stored in the matching TagWrapper.
     Public Function endTag(ByVal tagName As String, ByRef tagStack As System.Collections.Stack, ByRef valueStack As System.Collections.Stack) As Boolean Implements Processor.endTag
          ' Only react to the tags TProcessor cares about.
          If tagName Is Nothing OrElse tagStack.Count = 0 Then
               Return False
          End If
          Dim openTag As String = TryCast(tagStack.Peek(), String)
          If Not OwnedTag.IsMatch(tagName) OrElse Not String.Equals(tagName, openTag, StringComparison.Ordinal) Then
               Return False
          End If

          ' Always pop the tag, even when the element carried no text;
          ' otherwise an empty field would stay on the stack and every later
          ' end tag would fail to match.
          tagStack.Pop()
          If String.Equals(tagName, RecordTag, StringComparison.Ordinal) Then
               flush()
          ElseIf valueStack.Count <> 0 Then
               execute(tagName, CStr(valueStack.Pop()))
          End If
          Return True
     End Function

     ' Creates (or overwrites) the output file. A writer left open by an
     ' earlier call is closed first so it is not leaked.
     Public Sub open() Implements Processor.open
          close()
          sw = New StreamWriter(Me.filename)
     End Sub

     ' Handles the start of an element: pushes tagName onto tagStack and
     ' returns True when it is one of this processor's tags, else False.
     Public Function startTag(ByVal tagName As String, ByRef tagStack As System.Collections.Stack, ByRef valueStack As System.Collections.Stack) As Boolean Implements Processor.startTag
          If tagName Is Nothing OrElse Not OwnedTag.IsMatch(tagName) Then
               Return False
          End If
          tagStack.Push(tagName)
          Return True
     End Function

     ' Handles a text node: pushes value onto valueStack and returns True
     ' when the currently open element (top of tagStack) is one of this
     ' processor's tags, else False.
     Public Function text(ByVal value As String, ByRef tagStack As System.Collections.Stack, ByRef valueStack As System.Collections.Stack) As Boolean Implements Processor.text
          If tagStack.Count = 0 Then
               Return False
          End If
          Dim openTag As String = TryCast(tagStack.Peek(), String)
          If openTag Is Nothing OrElse Not OwnedTag.IsMatch(openTag) Then
               Return False
          End If
          valueStack.Push(value)
          Return True
     End Function


     ' Writes one record: the formatted values of all wrappers, concatenated
     ' in list order, with no separator or line terminator added here.
     Private Sub flush()
          Dim record As New StringBuilder()
          For Each wrapper As TagWrapper In tagWrapperList
               record.Append(wrapper.getFormatValue())
          Next
          sw.Write(record.ToString())
          sw.Flush()
     End Sub

     ' Stores value in the first wrapper that accepts tagName; does nothing
     ' when no wrapper accepts it.
     Private Sub execute(ByVal tagName As String, ByVal value As String)
          For Each wrapper As TagWrapper In tagWrapperList
               If wrapper.accept(tagName) Then
                    wrapper.Value = value
                    Exit Sub
               End If
          Next
     End Sub

End Class
 

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章