MartyConnelly
martyconnelly at shaw.ca
Thu Mar 31 12:09:21 CST 2005
Here are some snippets you may find useful for xml
Multilingual xml test files with phrases of quick brown fox variety
I like the german one
"Falsches Üben von Xylophonmusik quält jeden größeren Zwerg"
"Wrongful practicing of xylophone music tortures every larger dwarf"
http://www5.brinkster.com/mconnelly/xmltest/testUTF-8.xml
http://www5.brinkster.com/mconnelly/xmltest/testUTF-16.xml
To play around you will need the files with proper BOM markers.
http://www5.brinkster.com/mconnelly/xmltest/testUTF-16.zip
This snippet converts a UTF-8 Unicode File to UTF-16 Unicode file.
Actually it will convert to various ISO 8859 encodings as well just
change the charsets.
physically dropping the bad characters, you will have to change the xml
encoding instruction at the start of the output file
manually or via replace code. see the url for method
Sub ReadUTF8SaveFileInUTF16test(strfilepath As String)
'1/2 ReadToFile / SaveToFile snippet
'http://www.codeproject.com/soap/XMJFileStreaming.asp?msg=841289&mode=all&userid=903408#xx767979xx
'drops bad characters in xml entities like hex bb or 99 raquo or trademark
Dim stm As ADODB.Stream
Dim strPath As String
Dim strData As String
Set stm = New ADODB.Stream
stm.Open
stm.Charset = "UTF-8"
stm.Position = 0
stm.Type = adTypeText
'stm.LoadFromFile strfilepath
stm.LoadFromFile "C:\SOAP\GoogleSoap\testgds4uni.xml"
' if you just try and dump out stream
' without reading and writing you get double BOM marker
stm.Position = 0
strData = stm.ReadText()
Debug.Print strData
stm.Position = 0
'the charset names are listed in registry, you may have to check for
spelling and case.
'The character set names for the machine are in the registry
'For a list of the character set strings that is known by a system, see
'the subkeys of HKEY_CLASSES_ROOT\MIME\Database\Charset
'in the Windows Registry.
stm.Charset = "Unicode" '"UTF-8" '"Unicode" ' "Unicode" '"ascii"
'"Big5" '"hebrew" "ISO-8859-1"
stm.WriteText (strData)
stm.SaveToFile "C:\SOAP\GoogleSoap\testgds4unifin.xml",
adSaveCreateOverWrite
stm.Close
Set stm = Nothing
End Sub
Here is some useful test code for xml parsing errors that displays the
actual bad xml character in an error report.
I got tired of seeing bad character Line 110 character 78 then having
to scroll through the xml to find it.
Sub testgdsxmlerror()
Dim xNodes As MSXML2.IXMLDOMNodeList
Dim xNode As MSXML2.IXMLDOMElement
Dim objxml As MSXML2.DOMDocument40
Dim oXMLError As IXMLDOMParseError
Dim strxmlfilepath As String
strxmlfilepath = "C:\SOAP\GoogleSoap\source.xml"
CheckBOM (strxmlfilepath)
Set objxml = CreateObject("Msxml2.DOMDocument")
objxml.Load (strxmlfilepath)
If objxml.parseError.errorCode <> 0 Then
Debug.Print " Reason: " & objxml.parseError.reason
Set oXMLError = objxml.parseError
reportParseError oXMLError
Else
Debug.Print strxmlfilepath & " file OK"
End If
objxml.SetProperty "SelectionNamespaces",
"xmlns:xsl='http://www.w3.org/1999/XSL/Transform'"
objxml.SetProperty "SelectionLanguage", "XPath"
Set xNode = objxml.selectSingleNode("/results[@count]")
Debug.Print xNode.getAttribute("count")
MsgBox xNode.XML
End Sub
Public Function reportParseError(err As IXMLDOMParseError)
'this is not setup to count tabs used as whitespace
Dim s As String
Dim r As String
Dim i As Long
s = ""
For i = 1 To err.linepos - 1
s = s & " "
Next
r = "XML Error loading " & err.URL & vbCrLf & " * " & err.reason
Debug.Print r
'show character postion of error; tired of counting on screen
If (err.Line > 0) Then
r = "at line " & err.Line & ", character " & err.linepos & vbCrLf & _
err.srcText & vbCrLf & s & "^"
End If
Debug.Print r
Debug.Print "url=" & err.URL & vbCrLf
End Function
'determine 2 or 3 byte BOM marker at file beginning
Sub CheckBOM(Optional strFileIn As Variant, Optional strIn As Variant)
'checkbom "C:\XML\Gil Encodings\encUTF8_NoDecl.xml"
On Error GoTo Err_handler
Dim strInputData As String * 4
Dim lpBuffer() As Byte
Dim intFreeFile As Integer
If Not IsMissing(strFileIn) Then
intFreeFile = FreeFile
Open strFileIn For Binary Access Read Lock Read As #intFreeFile Len = 4
ReDim lpBuffer(4)
Get #intFreeFile, , lpBuffer
Close #intFreeFile
ElseIf Not IsMissing(strIn) Then
'Can't makes this work since VBA is always converting the string to
UTF-16LE
lpBuffer = Left$(strIn, 4)
Else
MsgBox "Nothing To Do"
Exit Sub
End If
If lpBuffer(0) = 255 And lpBuffer(1) = 254 Then
Debug.Print "File is UTF-16 Little Endian"
ElseIf lpBuffer(0) = 254 And lpBuffer(1) = 255 Then
Debug.Print "File is UTF-16 Big Endian"
ElseIf lpBuffer(0) = 239 And lpBuffer(1) = 187 And lpBuffer(2) = 191 Then
Debug.Print "File is UTF-8"
'Start trying to figure out by other means this will only work on xml
files that start with "<?"
ElseIf lpBuffer(0) = 60 And lpBuffer(1) = 0 And lpBuffer(2) = 63 And
lpBuffer(3) = 0 Then
Debug.Print "File is UTF-16 Little Endian"
ElseIf lpBuffer(0) = 0 And lpBuffer(1) = 60 And lpBuffer(2) = 0 And
lpBuffer(3) = 63 Then
Debug.Print "File is UTF-16 Big Endian"
ElseIf lpBuffer(0) = 69 And lpBuffer(1) = 63 Then
Debug.Print "File can be in UTF-8, ASCII, ISO-8859-?, Shift-JIS, etc"
Else
Debug.Print "Can't seem to figure out the Character encoding"
End If
Err_Exit:
On Error Resume Next
Close #intFreeFile
Exit Sub
Err_handler:
Select Case err.Number
Case Else
MsgBox err.Number & " - " & err.Description
End Select
Resume Err_Exit:
End Sub
Steve Erbach wrote:
>Marty,
>
>
>
>>>There are the Windows typographic characters known by their HTML4
>>>
>>>
>character entity names, such as —, ‘, ™ and so on (
>
>and " emdash etc.). These have in fact been around for a while, and
>are understood even by a number of older browsers that do not support
>utf-8 <<
>
>Interesting. I'm using Gmail and the first two characters you typed in
>within the parentheses showed up as little boxes. You're right about
>the lack of universality. Gmail tells me that JavaScript is NOT
>enabled in Firefox when I know for a fact that it is...hmmm.
>
>
>
>>>I am scratching my head about this because these windows
>>>
>>>
>typographical characters ANSI 128 -159 as control characters are
>considered illegal characters in XML for example " decimal 153 hex 99
>and should be unicode escaped character
>"™" but some UTF-8 conversion programs don't do this conversion.
>properly so it screws up your xml parsers with illegal characters. I
>am almost tempted to do everything in UTF-16. <<
>
>I've seen a similar thing in an XML file I've been downloading lately.
>I now devote some computer resources to the search for Riemann
>hypothesis zeroes, one of those multiple computer shared resource
>things. Anyway, the website (zetagrid.net) publishes stats on who's
>produced the most zeroes over a period of time. Well, one of those
>lists has a character in it -- Ctrl-Z -- which Access chokes on
>when I try to import the XML file. My XML editor (Peter's XML Editor)
>finds the naughty character and I can erase it and re-save the file.
>
>I take it that you work much more extensively with Unicode than I do
>in my piddly little efforts.
>
>Steve Erbach
>
>
>On Wed, 30 Mar 2005 15:40:22 -0800, MartyConnelly <martyconnelly at shaw.ca> wrote:
>
>
>>Just a word of warning about some of this, you will run into it at some
>>point in time since Unicode files can be UTF-8 or UTF-16.
>>There are the Windows typographic characters known by their HTML4
>>character entity names, such as —, ‘, ™ and so on (
>>and " emdash etc.). These have in fact been around for a while, and are
>>understood even by a number of older browsers that do not support utf-8
>>and would not be able to understand the corresponding unicode
>>&#bignumber; representations. These have been around from before the
>>Unicode standard was set.
>>
>>Now if you consider the Western European "MS-Windows" character set,
>>windows-1252. This is a special cause of confusion: all of the
>>displayable character code values of iso-8859-1 coincide with the same
>>codes in this Windows code - but additionally, the Windows code assigns
>>displayable characters in the area which the iso-8859-n codes reserved
>>for control functions. In unicode, those characters have code values
>>above 256.
>>
>>I am scratching my head about this because these windows typographical
>>characters ANSI 128 -159 as control characters are considered illegal
>>characters in XML
>>for example " decimal 153 hex 99 and should be unicode escaped character
>>"™" but some UTF-8 conversion programs don't do this conversion.
>>properly so it screws up your xml parsers with illegal characters. I am
>>almost tempted to do everything in UTF-16.
>>
>>The windows control characters that cause the problem run from ANSI
>>decimal 128-159.
>>
>>If that isn't enough some little darlings changed the ISO-8859-1 spec to
>>handle the Euro character and you now have to look at Latin-9 or ISO-8859-15
>>
>>I still haven't groked all this yet. I still have to hunt through xml
>>files with a hexeditor to see what is going on..
>>
>>
>
>
>
--
Marty Connelly
Victoria, B.C.
Canada