It is recommended that XML documents contain the XML declaration line (example: <?xml version="1.0" encoding="UTF-8"?>). If your XML document contains this XML declaration line, then this processing instruction is the very first child node (index zero) in the XML document. So, you can check whether the very first child node is a processing instruction node named xml and, if it is, whether it has an encoding attribute; if it does, its value is the encoding you are looking for.
If the XML document does not have the XML declaration line or the encoding attribute, the workaround is to look at the very first bytes in the file. These first bytes are known as the Byte Order Mark, or BOM. ANSI formatted text files do not contain any BOM bytes; UTF-8 format files contain EF BB BF as the first three bytes, and files saved as Unicode (UTF-16) contain FF FE (little endian) or FE FF (big endian) as the first two bytes. The following figure shows ANSI, UTF-8, and Unicode formatted files opened in Visual Studio 6.0 in binary mode.
Here is the sample C++ code that:
- First, checks whether the very first child node is a processing instruction named xml and whether it has an encoding attribute.
- If the above condition is not met, tries to determine the file encoding based on the first few bytes of the file.
#include "stdafx.h"
#include "tchar.h"
#include "atlbase.h"
#import <msxml4.dll> named_guids
using namespace MSXML2;
// Throws the int value -1 when the given HRESULT satisfies FAILED(hr);
// does nothing otherwise. main() catches the throw with catch(...).
#define CHECK_HR_THROW(hr) { if (FAILED(hr)) { throw -1; } }
int main(int argc, char* argv[])
{
HRESULT hr = S_OK;
USES_CONVERSION;
hr = CoInitialize(NULL);
try
{
CComPtr<IXMLDOMDocument2> spXMLDOMDoc;
hr = spXMLDOMDoc.CoCreateInstance(CLSID_DOMDocument40);
CHECK_HR_THROW(hr);
spXMLDOMDoc->async = VARIANT_FALSE;
spXMLDOMDoc->validateOnParse = VARIANT_FALSE;
bool bEncodingAttributeFound = false;
if(spXMLDOMDoc->load(CComVariant(_T("c:\\books.xml"))) == VARIANT_TRUE)
{
CComPtr<IXMLDOMNode> spDOMNode;
spDOMNode = spXMLDOMDoc->childNodes->item[0];
if(spDOMNode->nodeType == NODE_PROCESSING_INSTRUCTION &&
_tcsicmp(W2A(spDOMNode->nodeName), _T("xml")) == 0)
{
CComPtr<IXMLDOMNamedNodeMap> spXMLDeclAttributes;
spXMLDeclAttributes = spDOMNode->attributes;
printf(_T("XML Document encoding: %s\n"),
W2A(spXMLDeclAttributes->getNamedItem(_T("encoding"))->text ));
bEncodingAttributeFound = true;
}
}
else
{
printf(_T("Failed to load XML file 'c:\\books.xml'. %s\n"),
W2A(spXMLDOMDoc->parseError->reason));
}
if(!bEncodingAttributeFound)
{
printf(_T("\nXML Declaration line/encoding attribute not found!!\n"));
// Encoding Attribute / XML Declaration line not found
// Try using BOM (Byte Order Mark)
FILE* filePtr = fopen(_T("c:\\books.xml"), _T("rb"));
if(filePtr != NULL)
{
byte fileByte[1];
if(fread(fileByte, 1, 1, filePtr) == 1)
{
// See if it is unicode
if(fileByte[0] == (byte)255 || fileByte[0] == (byte)254)
{// FF or FE
if(fread(fileByte, 1, 1, filePtr) == 1)
{
if(fileByte[0] == (byte)254 || fileByte[0] == (byte)255)
{// FE or FF
printf(_T("UNICODE (UTF-16)"));
}
}
}
else
{
// See if file is stored in UTF-8 format
if(fileByte[0] == (byte)239)
{// EF
if(fread(fileByte, 1, 1, filePtr) == 1)
{
if(fileByte[0] == (byte)187)
{// BB
if(fread(fileByte, 1, 1, filePtr) == 1)
{
if(fileByte[0] == (byte)191)
{// BF
printf(_T("UTF-8"));
}
}
}
}
}
else
{
// Assume it is UTF-8 file
printf(_T("UTF-8"));
}
}
}
}
fclose(filePtr);
}
}
catch(...)
{
ATLASSERT(_T("Exception raised!\n"));
}
//getchar();
CoUninitialize();
return 0;
}
|