如同我之前的一篇文章说的那样,我没有支持DTD与命名空间,
当前实现出来的解析器,只能与xmlhttp对比,因为chrome浏览器解析大文档有bug,至于其他人实现的,我就不一一测试了,既然都决定自己实现了,我只选择大公司的代码做对比。
测试文档大小:3M bytes,约90000个节点。
aqx::xdoc :耗时 70-80ms,内存占用30-40M bytes,30和40主要是32位和64位的区别,如果要追求最少的内存占用,还可以更极端一些,解析速度很难再有本质的提升了,后续要完善的支持,也不会影响解析速度。
xmlhttp: 耗时3000-4000 ms,内存占用约 800M bytes。
目前测试过的系统有:
windows vc++ (vs2017) x86 x64
linux centos7 g++ version(9.1.1) x86 x64
Windows 中支持 3 种编码格式:UTF-8、UTF-16,以及操作系统当前的 ANSI 代码页编码(严格来说并不是 ASCII);在简体中文 Windows 中,也就是通常我们说的 GB2312(实际为 GBK / CP936)。
然后目前,我不太可能针对实现细节将原理讲清楚,讲真的,C++的可读性真的非常糟糕,但对于这种需求,还是得用它,这种代码,我自己写完看着不难受,但对别人来说很可能是噩梦,同样的道理,我看别人的C++代码,也会困惑,要让看不懂代码的人,也理解实现细节,这是非常不科学的事。。。
好了,废话不说了,上代码:
//xml.hpp
#pragma once
#include <setjmp.h>
#include <stdio.h>
#include <string.h>
#include <wchar.h>
#include <fstream>
#include <list>
#include <map>
#include <set>
#include <string>
#include <vector>
#pragma warning(push)
#pragma warning(disable:4996)
namespace aqx {
namespace aqx\_internal {
#ifndef __AQX_UTF8_CHAR_LEN
#define __AQX_UTF8_CHAR_LEN
// Length in bytes of a UTF-8 sequence, looked up by its first byte.
// Continuation bytes (0x80-0xBF) and the bytes 0xFE/0xFF map to 1, so a
// scanner that steps by this length always advances by at least one byte.
static unsigned char utf8_char_len[] = {
    /* 0x00-0x0F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x10-0x1F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x20-0x2F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x30-0x3F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x40-0x4F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x50-0x5F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x60-0x6F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x70-0x7F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x80-0x8F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x90-0x9F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xA0-0xAF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xB0-0xBF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xC0-0xCF */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xD0-0xDF */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xE0-0xEF */ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    /* 0xF0-0xFF */ 4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};
#endif
// Syntax-class bitmask (the XML_SYNTAX flag values) for every single-byte
// character value.
// The table is indexed with `(unsigned char)c`, so it spans all 256 byte
// values. Only 0x00-0x7F carry explicit entries; 0x80-0xFF are left
// zero-initialised ("no syntax class"), which makes a stray UTF-8 continuation
// byte or 0xFE/0xFF an ordinary syntax error instead of an out-of-bounds read.
static unsigned short xml_char_syntax[256] = {
    /* 0x00 */ 0,0,0,0,0,0,0,0,0,8,8,0,0,8,0,0,
    /* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 */ 8,64,4,0,0,0,0,4,0,0,0,0,0,8208,16,128,
    /* 0x30 */ 1552,1552,1552,1552,1552,1552,1552,1552,1552,1552,0,256,1,2048,2,0,
    /* 0x40 */ 0,1072,1072,1072,1072,1072,1072,48,48,48,48,48,48,48,48,48,
    /* 0x50 */ 48,48,48,48,48,48,48,48,48,48,48,4096,0,0,0,48,
    /* 0x60 */ 0,1072,1072,1072,1072,1072,1072,48,48,48,48,48,48,48,48,48,
    /* 0x70 */ 48,48,48,48,48,48,48,48,48,48,48,0,0,0,0,0,
};
namespace XML_SYNTAX {
    // Syntax-class bit flags. A character's class is the OR of the flags it
    // belongs to; the scanner ANDs that with the currently allowed set.
    // The type is spelled out instead of `auto x{...}` so the constants are
    // `unsigned short` on every compiler (older ones deduce an
    // initializer_list from that form).
    static constexpr unsigned short _X_LT        = 0x0001; // <
    static constexpr unsigned short _X_GT        = 0x0002; // >
    static constexpr unsigned short _X_STRING    = 0x0004; // ' "
    static constexpr unsigned short _X_SPACE     = 0x0008; // \t \n \r and space
    static constexpr unsigned short _X_NAME      = 0x0010; // A-Z a-z 0-9 _ - .
    static constexpr unsigned short _X_BEGINNAME = 0x0020; // A-Z a-z _
    static constexpr unsigned short _X_EXCLAM    = 0x0040; // !
    static constexpr unsigned short _X_TAGEND    = 0x0080; // /
    static constexpr unsigned short _X_ESCAPEEND = 0x0100; // ;
    static constexpr unsigned short _X_NUMBER    = 0x0200; // decimal digit 0-9
    static constexpr unsigned short _X_HEX       = 0x0400; // hex digit 0-9 A-F a-f
    static constexpr unsigned short _X_EQUAL     = 0x0800; // =
    static constexpr unsigned short _X_LB        = 0x1000; // [
    static constexpr unsigned short _X_NEGATIVE  = 0x2000; // -
    static constexpr unsigned short _X_MULTIBYTE = 0x4000; // character outside the single-byte range
}
// Type used for every offset and length inside an XML document. It is a
// single alias so the maximum document size can be widened in one place.
using xml_size_t = unsigned int;
// "Not found / not set" sentinel for xml_size_t positions. The type is spelled
// out rather than deduced from `auto x{...}` so it is xml_size_t everywhere.
static constexpr xml_size_t _xnf = static_cast<xml_size_t>(-1);
// Position and length of an escape sequence. Reserved for fast substitution;
// nothing in this header fills it in yet.
struct xml_escape_pos { xml_size_t pos, len; };
template<typename _XtsTy>
class xparser_t;
//xml文本迭代器的基本模板类
template<typename \_Ty>
class xts\_t {
public:
using Basetype = \_Ty;
protected:
const \_Ty \*text;
xml\_size\_t size;
xml\_size\_t index;
\_Ty c;
unsigned char cl;
unsigned short s;
unsigned short flags;
};
//解析错误信息结构
//解析时不处理行,列问题,有错误发生时后处理,因为,行,列处理,会使解析速度慢差不多一倍。
struct xerrorpos {
xml\_size\_t pos;
int number;
std::string information;
xml\_size\_t line;
xml\_size\_t column;
};
// String constants for the char-based iterators. Together with
// xwidechar_constvalue this lets templated code reference the right literal
// type through `constval`.
struct xmultybyte_constvalue {
    static constexpr const char *emp = "";
    static constexpr const char *br_tag = "<br/>";
    static constexpr const char *crlf = "\r\n";              // CR LF, two characters
    static constexpr const char *end_tag_syntax = "</";
    static constexpr const char *autoend_tag_syntax = "/>";
    static constexpr const char *comment_end = "--";
    static constexpr const char *cdata_end = "]]>";
};
// Wide-character counterpart of xmultybyte_constvalue, used by the
// wchar_t-based iterator.
struct xwidechar_constvalue {
    static constexpr const wchar_t *emp = L"";
    static constexpr const wchar_t *br_tag = L"<br/>";
    static constexpr const wchar_t *crlf = L"\r\n";          // CR LF, two characters
    static constexpr const wchar_t *end_tag_syntax = L"</";
    static constexpr const wchar_t *autoend_tag_syntax = L"/>";
    static constexpr const wchar_t *comment_end = L"--";
    static constexpr const wchar_t *cdata_end = L"]]>";
};
//utf8的文本迭代器,先基于这个来实现
class xts\_utf8 : public xts\_t<char>
{
public:
using strtype = std::string;
static constexpr int \_encoding{ 2 };
using constval = xmultybyte\_constvalue;
//初始化
void init(const char \*\_Text, xml\_size\_t \_Size) {
text = \_Text;
size = \_Size;
index = 0;
c = text\[0\];
cl = utf8\_char\_len\[(unsigned char)c\];
s = (cl != 1) ?
XML\_SYNTAX::\_X\_MULTIBYTE | XML\_SYNTAX::\_X\_BEGINNAME | XML\_SYNTAX::\_X\_NAME :
xml\_char\_syntax\[(unsigned char)c\];
}
//处理下一个字符
void next() {
index += cl;
c = text\[index\];
cl = utf8\_char\_len\[(unsigned char)c\];
s = (cl != 1) ?
XML\_SYNTAX::\_X\_MULTIBYTE | XML\_SYNTAX::\_X\_BEGINNAME | XML\_SYNTAX::\_X\_NAME :
xml\_char\_syntax\[(unsigned char)c\];
}
//向前回退n个字符,目前,只有在根节点之前的处理,有用到这个
void back(xml\_size\_t len) {
index -= len;
c = text\[index\];
cl = utf8\_char\_len\[(unsigned char)c\];
s = (cl != 1) ?
XML\_SYNTAX::\_X\_MULTIBYTE | XML\_SYNTAX::\_X\_BEGINNAME | XML\_SYNTAX::\_X\_NAME :
xml\_char\_syntax\[(unsigned char)c\];
}
//next,并判断语法
bool next\_is\_flags() {
next();
return (flags & s) != 0;
}
//next, 并判断下一个字符的值
bool next\_is\_char(char \_Chr) {
next();
return \_Chr == c;
}
//解析错误时,用于获取行,列。
void next\_donot\_syntax() {
index += cl;
c = text\[index\];
cl = utf8\_char\_len\[(unsigned char)c\];
}
//设置允许的语法
void set\_flags(unsigned short \_Flags) {
flags = \_Flags;
}
private:
friend class xparser\_t<xts\_utf8>;
};
//asc的文本迭代器
class xts\_asc : public xts\_t<char>
{
public:
using strtype = std::string;
static constexpr int \_encoding{ 0 };
using constval = xmultybyte\_constvalue;
//初始化
void init(const char \*\_Text, xml\_size\_t \_Size) {
text = \_Text;
size = \_Size;
index = 0;
c = text\[0\];
cl = ((unsigned short)c >= 0x80) ? 2 : 1;
s = (cl != 1) ?
XML\_SYNTAX::\_X\_MULTIBYTE | XML\_SYNTAX::\_X\_BEGINNAME | XML\_SYNTAX::\_X\_NAME :
xml\_char\_syntax\[(unsigned char)c\];
}
//处理下一个字符
void next() {
index += cl;
c = text\[index\];
cl = ((unsigned short)c >= 0x80) ? 2 : 1;
s = (cl != 1) ?
XML\_SYNTAX::\_X\_MULTIBYTE | XML\_SYNTAX::\_X\_BEGINNAME | XML\_SYNTAX::\_X\_NAME :
xml\_char\_syntax\[(unsigned char)c\];
}
//向前回退n个字符,目前,只有在根节点之前的处理,有用到这个
void back(xml\_size\_t len) {
index -= len;
c = text\[index\];
cl = ((unsigned short)c >= 0x80) ? 2 : 1;
s = (cl != 1) ?
XML\_SYNTAX::\_X\_MULTIBYTE | XML\_SYNTAX::\_X\_BEGINNAME | XML\_SYNTAX::\_X\_NAME :
xml\_char\_syntax\[(unsigned char)c\];
}
//next,并判断语法
bool next\_is\_flags() {
next();
return (flags & s) != 0;
}
//next, 并判断下一个字符的值
bool next\_is\_char(char \_Chr) {
next();
return \_Chr == c;
}
void next\_donot\_syntax() {
index += cl;
c = text\[index\];
cl = ((unsigned short)c >= 0x80) ? 2 : 1;
}
//设置允许的语法
void set\_flags(unsigned short \_Flags) {
flags = \_Flags;
}
private:
friend class xparser\_t<xts\_asc>;
};
class xts\_utf16 : public xts\_t<wchar\_t> {
public:
using strtype = std::wstring;
static constexpr int \_encoding{ 1 };
using constval = xwidechar\_constvalue;
xts\_utf16() {
//utf16的字符不是变长的,固定为1
//虽然有4字节的utf16字符,但影响不到最终解析逻辑。
cl = 1;
}
private:
void init(const wchar\_t \*\_Text, xml\_size\_t \_Size) {
text = \_Text;
size = \_Size;
index = 0;
c = text\[0\];
cl = 1;
s = ((unsigned short)c >= 0x80) ?
XML\_SYNTAX::\_X\_MULTIBYTE | XML\_SYNTAX::\_X\_BEGINNAME | XML\_SYNTAX::\_X\_NAME :
xml\_char\_syntax\[(unsigned char)c\];
}
//处理下一个字符
void next() {
c = text\[++index\];
s = ((unsigned short)c >= 0x80) ?
XML\_SYNTAX::\_X\_MULTIBYTE | XML\_SYNTAX::\_X\_BEGINNAME | XML\_SYNTAX::\_X\_NAME :
xml\_char\_syntax\[(unsigned char)c\];
}
void back(xml\_size\_t len) {
index -= len;
c = text\[index\];
s = ((unsigned short)c >= 0x80) ?
XML\_SYNTAX::\_X\_MULTIBYTE | XML\_SYNTAX::\_X\_BEGINNAME | XML\_SYNTAX::\_X\_NAME :
xml\_char\_syntax\[(unsigned char)c\];
}
//next,并判断语法
bool next\_is\_flags() {
next();
return (flags & s) != 0;
}
//next, 并判断下一个字符的值
bool next\_is\_char(char \_Chr) {
next();
return \_Chr == c;
}
void next\_donot\_syntax() {
c = text\[++index\];
}
//设置允许的语法
void set\_flags(unsigned short \_Flags) {
flags = \_Flags;
}
private:
friend class xparser\_t<xts\_utf16>;
};
template<typename \_XtsTy>
class xdocument\_t;
template<typename \_Ty>
class xelement\_t;
template<typename \_Ty>
class xresource\_t {
public:
//xml节点数据结构,因为结构相互依赖的原因,所以嵌套在一起
using Basetype = typename \_Ty::value\_type;
class xnode;
using xtagindex\_t = std::list<xnode\*>;
using xtagindex\_ref = typename xtagindex\_t::iterator;
using xdoctext\_t = std::list<\_Ty>;
using xattrname\_t = std::set<\_Ty>;
using xattrvalue\_t = std::map<\_Ty, xml\_size\_t>;
using xtagtext\_t = std::map<\_Ty, xtagindex\_t>;
using xdoctext\_ref = typename xdoctext\_t::iterator;
using xtagtext\_ref = typename xtagtext\_t::iterator;
using xattrname\_ref = typename xattrname\_t::iterator;
using xattrvalue\_ref = typename xattrvalue\_t::iterator;
class xnode {
public:
using \_Self\_Reftype = typename std::list<xnode>::iterator;
xnode() {
parent = nullptr;
}
xnode(xnode \*\_Parent, xresource\_t \*\_Resource) {
parent = \_Parent;
ti.doc\_body\_ref = inner.end = inner.begin = \_Resource->docs.end();
}
private:
void refactor\_tag\_body(int \_Style, xml\_size\_t \_PreSize, xresource\_t \*\_Resource)
{
\_Ty &\_Tmp = \_Resource->refactor\_buffer;
\_Tmp.clear();
\_Tmp.reserve(\_PreSize);
\_Tmp += (Basetype)'<';
\_Tmp += ti.name->first;
for (auto it = attrs.begin(); it != attrs.end(); ++it) {
\_Tmp += (Basetype)' ';
\_Tmp += \*it->name;
\_Tmp += (Basetype)'=';
\_Tmp += (Basetype)it->st;
\_Tmp += it->value->first;
\_Tmp += (Basetype)it->st;
}
if (\_Style == 2)
\_Tmp += (Basetype)'/';
\_Tmp += (Basetype)'>';
if (ti.doc\_body\_ref == \_Resource->docs.end()) {
\_Resource->docs.push\_back(\_Tmp);
ti.doc\_body\_ref = --(\_Resource->docs.end());
parent->inner.end = ti.doc\_body\_ref;
if (parent->inner.begin == \_Resource->docs.end())
parent->inner.begin = ti.doc\_body\_ref;
}
else
{
\*ti.doc\_body\_ref = \_Tmp;
}
}
private:
friend class xresource\_t;
friend class xparser\_t<xts\_utf8>;
friend class xparser\_t<xts\_utf16>;
friend class xparser\_t<xts\_asc>;
friend class xdocument\_t<xts\_utf8>;
friend class xdocument\_t<xts\_utf16>;
friend class xdocument\_t<xts\_asc>;
friend class xelement\_t<xts\_utf8>;
friend class xelement\_t<xts\_utf16>;
friend class xelement\_t<xts\_asc>;
struct xattr { xattrname\_ref name; xattrvalue\_ref value; char st; };
struct tag\_info {
xtagtext\_ref name;//标签名称
xtagindex\_ref name\_index\_ref;//在标签名称索引中的引用,本质上其实就是个指针
xdoctext\_ref doc\_body\_ref;//整个标签信息,包含属性 <t ...> 在文档中的实体
}ti;
std::list<xattr> attrs;
struct xinner { xdoctext\_ref begin, end; }inner;
std::list<xnode> child;
xnode \*parent;
\_Self\_Reftype self;
};
xresource\_t() {
//预定义的几个转义符实体:lt gt amp quot apos
escape\_bodys\[{ (Basetype)'l', (Basetype)'t'}\] = { (Basetype)'<' };
escape\_bodys\[{ (Basetype)'g', (Basetype)'t' }\] = { (Basetype)'>' };
escape\_bodys\[{ (Basetype)'a', (Basetype)'m', (Basetype)'p' }\] = { (Basetype)'&' };
escape\_bodys\[{ (Basetype)'q', (Basetype)'u', (Basetype)'o', (Basetype)'t' }\] = { (Basetype)'"' };
escape\_bodys\[{ (Basetype)'a', (Basetype)'p', (Basetype)'o', (Basetype)'s' }\] = { (Basetype)'\\'' };
}
void clear() {
root.child.clear();
docs.clear();
tags.clear();
attr\_names.clear();
attr\_values.clear();
root.parent = nullptr;
root.ti.doc\_body\_ref = root.inner.end = root.inner.begin = docs.end();
}
xnode root;
xdoctext\_t docs;
xtagtext\_t tags;
xattrname\_t attr\_names;
xattrvalue\_t attr\_values;
std::map<\_Ty, \_Ty> escape\_bodys;
\_Ty refactor\_buffer;
};
template<typename \_XtsTy>
class xparser\_t {
public:
using \_StringTy = typename \_XtsTy::strtype;
using Basetype = typename \_XtsTy::Basetype;
xparser\_t() {
// 这里忽悠一下编译器,自动根据类型选择:strstr 或 wcsstr
typedef const char \*(\*STRSTRFUNC)(const char \*, const char \*);
typedef const wchar\_t \*(\*WSTRSTRFUNC)(const wchar\_t \*, const wchar\_t \*);
typedef const Basetype \*(\*MYSTRSTRFUNC)(const void \*, const void \*);
\_\_multiec\_strstr = ((sizeof(Basetype) == 1) ?
((MYSTRSTRFUNC)((STRSTRFUNC)strstr)) :
((MYSTRSTRFUNC)((WSTRSTRFUNC)wcsstr)));
}
private:
void x\_escape\_number()
{
//数值类型的unicode字符转义处理
//这里我是自己实现的字符串转换数字,
//因为使用C标准转换需要额外拷贝一次 & 到 ; 字符串,为了避免这个拷贝,就要临时改变转义符结束符 ; 的位置为0来给strtol去计算
//而在之后的dom类的load\_string设计中,很可能会直接允许static const char \*xxx= "...";这样的东西传入到这里进行解析。
//在windows中,数据段的静态常数成员是的内存页面保护是PAGE\_EXECUTE\_READ,不能写操作。
//所以我在这里简单实现了字符串 => 数字。
xml\_size\_t ebgn = xts.index - 1;
long long x = 0;
if (!xts.next\_is\_char('x')) {
// # 后面如果不是x,就按10进制的规则来处理
if(!(xts.s & XML\_SYNTAX::\_X\_NUMBER)) err(xts.index, 22);
xts.set\_flags(XML\_SYNTAX::\_X\_NUMBER | XML\_SYNTAX::\_X\_ESCAPEEND);
for (;;) {
x = (x \* 10) + (xts.c - '0');
if (!xts.next\_is\_flags()) err(xts.index, 22);
if (xts.c == ';')
break;
}
}
else
{
// # 后面是x,按16进制处理
xts.set\_flags(XML\_SYNTAX::\_X\_HEX);
if (!xts.next\_is\_flags()) err(xts.index, 23);
xts.set\_flags(XML\_SYNTAX::\_X\_HEX | XML\_SYNTAX::\_X\_ESCAPEEND);
int i = 0;
for (;; i++) {
long long \_Tmp;
switch (xts.c) {
case '0':case '1':case '2':case '3':case '4':case '5':case '6':case '7':case '8':case '9':
\_Tmp = xts.c - '0';
break;
case 'a':case 'b':case 'c':case 'd':case 'e':case 'f':
\_Tmp = xts.c - 'a' + 10;
break;
case 'A':case 'B':case 'C':case 'D':case 'E':case 'F':
\_Tmp = xts.c - 'A' + 10;
break;
}
x += (\_Tmp << (i << 2));
if (!xts.next\_is\_flags()) err(xts.index, 23);
if (xts.c == ';')
break;
}
//由于上面的十六进制数字计算顺序是反的,所以要从最高有效位来倒转
long long y = 0;
for (int k = 0; k <= i; k++)
y += ((x >> (k << 2)) & 0x0F) << ((i - k) << 2);
x = y;
}
if (x < 0x20) {
switch (x) {
case '\\t':case '\\r':case '\\n':
break;
default:
err(ebgn, 24);
}
}
else if (x > 0xD800 && x < 0xDFFF)
err(ebgn, 25);
else if (x > 0x10FFFF)
err(ebgn, 26);
}
void x\_escape\_body()
{
xml\_size\_t nbgn = xts.index;
for (;;) {
xts.next();
if (xts.c == ';') {
break;
}
else {
if (!(xts.s & XML\_SYNTAX::\_X\_BEGINNAME)) err(xts.index, 19);
}
}
\_StringTy &\_Tmp = \_strtmp\[4\];
\_Tmp.assign(xts.text + nbgn, xts.index - nbgn);
auto it = res->escape\_bodys.find(\_Tmp);
if (it == res->escape\_bodys.end()) {
errinfobuffer.reserve(\_Tmp.length() \* 3);
int n = sprintf((char\*)errinfobuffer.data(),
((sizeof(Basetype) != 1) ? "%ls" : "%s"),
\_Tmp.c\_str());
err(nbgn, 20, errinfobuffer.c\_str());
}
}
void x\_escape() {
xts.next();
if (xts.c == '#') {
x\_escape\_number();
}
else
{
if (!(xts.s & XML\_SYNTAX::\_X\_BEGINNAME)) err(xts.index, 18);
x\_escape\_body();
}
}
void x\_cdata() {
xml\_size\_t cbgn = xts.index - 2;
const char \*pcdata = "CDATA\[";
for (int i = 0; i < 6; i++) {
if (!xts.next\_is\_char(pcdata\[i\]))
err(xts.index, 16);
}
// CDATA的结束符比注释标签还要省事,直接向后搜索\]\]>
const Basetype \*p = \_\_multiec\_strstr(xts.text + xts.index + 1, \_XtsTy::constval::cdata\_end);
if (p) {
xts.index = (xml\_size\_t)(p - xts.text + 3);
res->docs.push\_back(\_StringTy(xts.text + cbgn, xts.index - cbgn));
cur->inner.end = --(res->docs.end());
if (cur->inner.begin == res->docs.end()) cur->inner.begin = cur->inner.end;
}
else
{
err(cbgn, 17);
}
}
void x\_comment() {
xml\_size\_t cbgn = xts.index - 2;
if (!xts.next\_is\_char('-')) err(xts.index, 13);
/\*
不太清楚为什么xml注释中不允许存在--,我反正照做了。
从代码此处看,实际上是可以允许的,就像是CDATA的结束符那样。
utf8的情况下,无法双字搜索。
utf16的情况下,也无法4字节搜索。
例如这种情况:
<--a-->,如果双字搜索,从a开始,有一个-就被忽略掉了,如果要判断这个问题,那实际上和单字节搜索一样的性能。
\*/
const Basetype \*p = \_\_multiec\_strstr(xts.text + xts.index + 1, \_XtsTy::constval::comment\_end);
if (p) {
if (p\[2\] == '>') {
xts.index = (xml\_size\_t)(p - xts.text + 3);
res->docs.push\_back(\_StringTy(xts.text + cbgn, xts.index - cbgn));
cur->inner.end = --(res->docs.end());
if (cur->inner.begin == res->docs.end()) cur->inner.begin = cur->inner.end;
}
else
{
err((xml\_size\_t)(p - xts.text), 15);
}
}
else
{
err(cbgn, 14);
}
}
void x\_specifics\_tag() {
//特殊标签,共有两个分支,注释和CDATA,DTD在根节点之前处理,不会进入这里
xts.set\_flags(XML\_SYNTAX::\_X\_LB | XML\_SYNTAX::\_X\_NEGATIVE);
if (!xts.next\_is\_flags()) err(xts.index, 2);
if (xts.c == '-')
x\_comment();
else
x\_cdata();
}
void x\_end\_node() {
//结束标签处理
xts.set\_flags(XML\_SYNTAX::\_X\_BEGINNAME);
if (!xts.next\_is\_flags()) err(xts.index, 10);
xml\_size\_t nbgn = xts.index;
xml\_size\_t nend;
xts.set\_flags(XML\_SYNTAX::\_X\_NAME | XML\_SYNTAX::\_X\_SPACE | XML\_SYNTAX::\_X\_GT);
bool \_BackSpace = false;
for (;;) {
if (!xts.next\_is\_flags()) err(xts.index, 10);
if (!(xts.s & XML\_SYNTAX::\_X\_NAME)) {
nend = xts.index;
if (xts.s & XML\_SYNTAX::\_X\_SPACE)
\_BackSpace = true;
break;
}
}
if (\_BackSpace) {
//后面还有空格
xts.set\_flags(XML\_SYNTAX::\_X\_SPACE | XML\_SYNTAX::\_X\_GT);
for (;;) {
if (!xts.next\_is\_flags()) err(xts.index, 11);
if (xts.c == '>')
break;
}
}
\_StringTy &tmp = \_strtmp\[0\];
tmp.reserve(nend - nbgn + 0x10);
tmp.assign(xts.text + nbgn, nend - nbgn);
if (tmp != cur->ti.name->first) {
errinfobuffer.reserve((tmp.length() + cur->ti.name->first.length()) \* 3 + 0x20);
int n = sprintf((char\*)errinfobuffer.data(),
((sizeof(Basetype) != 1) ? "%ls 与 %ls 不一致" : "%s 与 %s 不一致"),
tmp.c\_str(), cur->ti.name->first.c\_str());
err(nbgn, 12, errinfobuffer.c\_str());
}
if (cur->inner.begin == res->docs.end()) {
//如果这个节点的内容为空,说明,它跟一个自结束的节点没有区别
//直接在父节点中将它修改一个自结束节点即可。
cur->parent->inner.end->erase(cur->parent->inner.end->length() - 1);
cur->parent->inner.end->append(\_XtsTy::constval::autoend\_tag\_syntax);
}
else
{
res->docs.push\_back(\_XtsTy::constval::end\_tag\_syntax);
auto it = --(res->docs.end());
it->append(tmp);
(\*it) += (Basetype)'>';
cur->inner.end = it;
}
cur = cur->parent;
}
int x\_tag\_name() {
auto new\_name = \[this\](xml\_size\_t left, xml\_size\_t right) {
\_StringTy &tmp = \_strtmp\[0\];
tmp.assign(xts.text + left, right - left);
auto it = res->tags.find(tmp);
if (it == res->tags.end())
it = res->tags.insert({ tmp, std::list<\_Nodetype\*>() }).first;
it->second.push\_back(cur);
cur->ti.name = it;
cur->ti.name\_index\_ref = (--(it->second.end()));
};
xts.set\_flags(
XML\_SYNTAX::\_X\_NAME | //符合名称规范的字符
XML\_SYNTAX::\_X\_GT | // >
XML\_SYNTAX::\_X\_TAGEND | // /自结束标签
XML\_SYNTAX::\_X\_SPACE // 空白字符
);
xml\_size\_t name\_begin = xts.index;
for (;;) {
if (!xts.next\_is\_flags()) err(xts.index, 1);
switch (xts.c) {
case '>':
new\_name(name\_begin, xts.index);
return 1;
case '/':
if (!xts.next\_is\_char('>')) err(xts.index, 4);
new\_name(name\_begin, xts.index - 1);
return 2;
default:
if (xts.s & XML\_SYNTAX::\_X\_SPACE) {
new\_name(name\_begin, xts.index);
return 0;
}
break;
}
}
}
bool x\_attr\_name(\_StringTy &\_Name) {
xts.set\_flags(
XML\_SYNTAX::\_X\_NAME | //符合名称规范的字符
XML\_SYNTAX::\_X\_EQUAL | //等于号
XML\_SYNTAX::\_X\_SPACE // 空白字符
);
xml\_size\_t name\_begin = xts.index;
for (;;) {
if (!xts.next\_is\_flags()) err(xts.index, 5);
if (xts.s & (XML\_SYNTAX::\_X\_EQUAL | XML\_SYNTAX::\_X\_SPACE)) {
\_Name.assign(xts.text + name\_begin, xts.index - name\_begin);
return xts.c == '=';
}
}
return false;
}
char x\_attr\_value(\_StringTy &\_Value) {
xts.set\_flags(
XML\_SYNTAX::\_X\_STRING | //字符串 " '
XML\_SYNTAX::\_X\_SPACE // 空白字符
);
char \_Style;
for (;;) {
if (!xts.next\_is\_flags()) err(xts.index, 7);
if (xts.s & XML\_SYNTAX::\_X\_STRING) {
\_Style = (char)xts.c;
break;
}
}
xml\_size\_t value\_begin = xts.index + 1;
if (\_Style == '"') {
for (;;) {
xts.next();
switch (xts.c) {
case 0:
err(xts.index, 8);
case '<':
err(xts.index, 9);
case '&':
//处理转义符
x\_escape();
break;
case '"':
//字符串结束
\_Value.assign(xts.text + value\_begin, xts.index - value\_begin);
return \_Style;
default:
break;
}
}
}
else
{
for (;;) {
xts.next();
switch (xts.c) {
case 0:
err(xts.index, 8);
case '<':
err(xts.index, 9);
case '&':
//处理转义符
x\_escape();
break;
case '\\'':
//字符串结束
\_Value.assign(xts.text + value\_begin, xts.index - value\_begin);
return \_Style;
default:
break;
}
}
}
return \_Style;
}
void x\_attr(xml\_size\_t &\_Presize) {
\_StringTy &name = \_strtmp\[0\];
\_StringTy &value = \_strtmp\[1\];
if (!x\_attr\_name(name)) {
//x\_attr\_name中没有找到等于号,对应这种: <a x =...
xts.set\_flags(
XML\_SYNTAX::\_X\_EQUAL | //等号
XML\_SYNTAX::\_X\_SPACE // 空白字符
);
for (;;) {
if (!xts.next\_is\_flags()) err(xts.index, 7);
if (xts.c == '=') break;
}
}
char \_Style = x\_attr\_value(value);
\_Presize += (xml\_size\_t)(name.length() + value.length() + 6);
auto itn = res->attr\_names.find(name);
if (itn == res->attr\_names.end())
itn = res->attr\_names.insert(name).first;
auto itv = res->attr\_values.find(value);
if (itv == res->attr\_values.end())
itv = res->attr\_values.insert({ value, 1 }).first;
cur->attrs.push\_back({ itn, itv, \_Style });
}
int x\_preattr(xml\_size\_t &\_Presize) {
/\*
x\_tag\_name里没有找到 > 的情况下,在标签属性解析开始之前,
对应下面这几种情况:
<a >
<a />
<a x=...
\*/
xts.set\_flags(
XML\_SYNTAX::\_X\_SPACE | //空白字符
XML\_SYNTAX::\_X\_GT | // >
XML\_SYNTAX::\_X\_BEGINNAME | //名称首字符
XML\_SYNTAX::\_X\_TAGEND // /自结束标签
);
for (;;) {
if (!xts.next\_is\_flags()) err(xts.index, 5);
switch (xts.c) {
case '>':
return 1;
case '/':
if (!xts.next\_is\_char('>')) err(xts.index, 4);
return 2;
default:
if (xts.s & XML\_SYNTAX::\_X\_BEGINNAME) {
x\_attr(\_Presize);
xts.set\_flags(
XML\_SYNTAX::\_X\_SPACE | // 空白字符
XML\_SYNTAX::\_X\_GT | // >
XML\_SYNTAX::\_X\_BEGINNAME | // 名称首字符
XML\_SYNTAX::\_X\_TAGEND // /自结束标签
);
}
break;
}
}
}
void x\_new\_node() {
cur->child.push\_back(\_Nodetype(cur, res));
auto it = (--cur->child.end());
cur = &(\*it);
cur->self = it;
int n = x\_tag\_name();
xml\_size\_t \_PreSize = (xml\_size\_t)(cur->ti.name->first.length() + 3);
if (!n) n = x\_preattr(\_PreSize);
cur->refactor\_tag\_body(n, \_PreSize, res);
if (n == 2)
cur = cur->parent;
}
void x\_tag() {
//标签开始后,下一个字符只能是 符合名称规范的第一个字符,感叹号 !,结束标签 /
xts.next();
switch (xts.c) {
case '!':
x\_specifics\_tag();
break;
case '/':
x\_end\_node();
break;
default:
if (!(xts.s & XML\_SYNTAX::\_X\_BEGINNAME)) err(xts.index, 1);
x\_new\_node();
break;
}
}
void x\_text() {
//标签之外的有效文本处理
xml\_size\_t tbegin = \_xnf;
\_StringTy &tmp = \_strtmp\[3\];
tmp.clear();
for (;;) {
xts.next();
switch (xts.c) {
case 0:
return;
case '&':
if (tbegin == \_xnf)
tbegin = xts.index;
x\_escape();
break;
case '<':
//处理标签之前,先处理有效文本
if (tbegin != \_xnf) {
if (tmp.length()) tmp += ' ';
tmp.append(xts.text + tbegin, xts.index - tbegin);
tbegin = \_xnf;
}
if (tmp.length()) {
res->docs.push\_back(tmp);
cur->inner.end = --(res->docs.end());
if (cur->inner.begin == res->docs.end()) cur->inner.begin = cur->inner.end;
tmp.clear();
}
x\_tag();
break;
default:
if (!(xts.s & XML\_SYNTAX::\_X\_SPACE)) {
if (tbegin == \_xnf)
tbegin = xts.index;
}
else
{
//遇到空白字符时,如果有效文本开始位置已经记录过了,则将这一段有效的东西添加到有效文本结
if (tbegin != \_xnf) {
if (tmp.length()) tmp += ' ';
tmp.append(xts.text + tbegin, xts.index - tbegin);
tbegin = \_xnf;
}
}
}
}
}
void x\_dtd() {
xml\_size\_t pos = xts.index - 1;
const char \*p = "OCTYPE";
for (int i = 0; i < 6; i++) {
if (!xts.next\_is\_char(p\[i\])) err(pos, 2);
}
int n = 1;
int \_StrType = 0;
for (;;) {
xts.next();
switch (xts.c) {
case '"':
case '\\'':
if (!\_StrType)
\_StrType = xts.c;
else if (\_StrType == xts.c)
\_StrType = 0;
break;
case '<':
n++;
break;
case '>':
if (!\_StrType) {
if (!(--n)) {
res->docs.push\_back(\_StringTy(xts.text + pos, xts.index - pos + 1));
cur->inner.end = --(res->docs.end());
if (cur->inner.begin == res->docs.end()) cur->inner.begin = cur->inner.end;
//wprintf(L"%s\\n", cur->inner.end->c\_str());
return;
}
}
break;
default:
break;
}
}
}
void x\_declare() {
xml\_size\_t pos = xts.index - 1;
int \_StrType = 0;
for (;;) {
xts.next();
switch (xts.c) {
case '"':
case '\\'':
if (!\_StrType)
\_StrType = xts.c;
else if (\_StrType == xts.c)
\_StrType = 0;
break;
case '?':
if (!\_StrType) {
if (!xts.next\_is\_char('>')) err(xts.index, 30);
res->docs.push\_back(\_StringTy(xts.text + pos, xts.index - pos + 1));
cur->inner.end = --(res->docs.end());
if (cur->inner.begin == res->docs.end()) cur->inner.begin = cur->inner.end;
return;
}
break;
default:
break;
}
}
}
int x\_root() {
if (setjmp(\_Rem)) return -1;
xts.set\_flags(XML\_SYNTAX::\_X\_LT | XML\_SYNTAX::\_X\_SPACE);
bool root\_break = false;
for (;;) {
if (xts.c == '<') {
xts.next();
switch (xts.c) {
case '!':
xts.next();
if (xts.c == '-')
x\_comment();
else if (xts.c == 'D')
x\_dtd();
else
err(xts.index, 2);
xts.set\_flags(XML\_SYNTAX::\_X\_LT | XML\_SYNTAX::\_X\_SPACE);
break;
case '?':
if (xts.index != 1)
err(xts.index, 31);
x\_declare();
xts.set\_flags(XML\_SYNTAX::\_X\_LT | XML\_SYNTAX::\_X\_SPACE);
break;
default:
if (xts.s & XML\_SYNTAX::\_X\_BEGINNAME) {
xts.back(1);
x\_tag();
x\_text();
root\_break = true;
}
else
{
err(xts.index, 2);
}
break;
}
if (root\_break)
break;
}
if (!xts.next\_is\_flags())
err(xts.index, 3);
}
if (cur != &(res->root)) {
//解析完字符串之后,如果当前标签不为null,则属于错误。
err(xts.size, 28);
}
return 0;
}
void err(xml\_size\_t \_Pos, int \_Number, const char \*\_Info = "") {
errp = { \_Pos, \_Number, \_Info };
longjmp(\_Rem, 1);
}
public:
int load(const Basetype \*\_Text, int \_Size, xresource\_t<\_StringTy> \*pres) {
xts.init((const Basetype\*)\_Text, \_Size);
errp.number = 0;
errp.pos = 0;
res = pres;
cur = &(res->root);
return x\_root();
}
void get\_errp(xerrorpos &e) {
e = errp;
}
void get\_err\_pos(xerrorpos &e) {
e.line = 0;
e.column = 0;
if (!e.pos || !e.number)
return;
auto pos = xts.index - xts.cl;
xts.index = 0;
xts.c = xts.text\[0\];
e.line = 1;
e.column = 1;
for (; xts.index < e.pos; xts.next\_donot\_syntax())
{
if (xts.c == '\\n') {
e.line++;
e.column = 1;
}
else
{
e.column++;
}
}
xts.index = pos;
xts.next();
}
private:
friend class xelement\_t<\_XtsTy>;
jmp\_buf \_Rem;
\_XtsTy xts;
xerrorpos errp;
using \_Nodetype = typename xresource\_t<\_StringTy>::xnode;
\_Nodetype \*cur;
xresource\_t<\_StringTy> \*res;
\_StringTy \_strtmp\[8\];//由于使用了jmp\_buf来进行错误直接远跳,为了避免内存泄漏,所以将栈中需要的字符串对象也储存在这里
const Basetype\*(\*\_\_multiec\_strstr)(const void\*, const void\*);
std::string errinfobuffer;
};
// Message for every error code, indexed by the code itself. Entries
// containing %s expect the error's extra information as the single argument.
// Both the strings and the table are immutable.
static const char *const xml_error_information[] = {
    "",
    "开始标签:无效的元素名称", //1
    "根节点之前的无效的特殊标签", //2
    "根节点之前的无效的字符", //3
    "自结束标签:此处应为 >", //4
    "标签属性:无效的标签属性名称", //5
    "标签属性:此处应为 =", //6
    "标签属性:此处应为 \" 或 '", //7
    "标签属性:未找到对应的属性结束符(\" 或 ')", //8
    "标签属性:< 不允许出现在属性值中", //9
    "结束标签:无效的元素名称", //10
    "结束标签:此处应为 >", //11
    "结束标签:开始标签与结束标签不匹配,参考信息:%s", //12
    "注释标签:无效的注释标签,此处或许应为 -",//13
    "注释标签:未找到注释标签结束符(-->)",//14
    "注释标签:-- 不允许单独出现在注释标签中",//15
    "CDATA:无效的CDATA标签",//16
    "CDATA:未找到CDATA结束符(]]>)",//17
    "转义符:无效的转义符名称首字符",//18
    "转义符:无效的转义符字符",//19
    "转义符:%s 是未定义的实体",//20
    "转义符:无效的转义符字符",//21
    "字符数值转义:无效的10进制数字字符",//22
    "字符数值转义:无效的16进制数字字符",//23
    "字符数值转义:小于32(0x20)的字符仅允许\\t\\r\\n出现在xml中",//24
    "字符数值转义:0xD800-0xDFFF为UNICODE代理字符,不允许单独出现在xml中",//25
    "字符数值转义:字符值溢出,参考最大值(0x10FFFF)",//26
    "转义符:无效的转义符",//27
    "根节点未封闭",//28
    "无效的文档:%s",//29
    "XML声明种错误的符号,此处应为 >",//30
    "XML声明前不允许存在其他字符",//31
    "未找到XML声明结束符(?>)",//32
};
template<typename \_XtsTy>
class xelement\_t {
public:
using \_StringTy = typename \_XtsTy::strtype;
using \_Nodetype = typename xresource\_t<\_StringTy>::xnode;
using Basetype = typename xresource\_t<\_StringTy>::Basetype;
bool eof() {
return \_Node == nullptr;
}
xelement\_t(\_Nodetype \*\_Val) {
\_Node = \_Val;
}
bool operator==(xelement\_t &e) {
return e.\_Node == \_Node;
}
bool operator!=(xelement\_t &e) {
return e.\_Node != \_Node;
}
\_StringTy get\_name() {
if (eof()) return \_XtsTy::constval::emp;
return \_Node->ti.name->first;
}
\_StringTy get\_attr(const \_StringTy &\_AttrName) {
if (eof()) return \_XtsTy::constval::emp;
for (auto it = \_Node->attrs.begin(); it != \_Node->attrs.end(); ++it) {
if (\*(it->name) == \_AttrName)
return it->value->first;
}
return "";
}
\_StringTy get\_text(int \_Flags = 0) {
if (eof()) return \_XtsTy::constval::emp;
\_StringTy \_Tmp;
auto begin = \_Node->inner.begin;
auto end = \_Node->inner.end;
if (\_Flags & 1) {
--begin;
++end;
}
for (auto it = begin; it != end; ++it) {
if (it->length() > 6 && it->at(0) == '<' && it->at(1) == '!' && it->at(2) == '-')
continue;
\_Tmp += it->c\_str();
}
return \_Tmp;
}
\_StringTy get\_inner\_xml() {
if (eof()) return \_XtsTy::constval::emp;
\_StringTy \_Tmp;
for (auto it = \_Node->inner.begin; it != \_Node->inner.end; ++it) {
if (it->length() > 3 &&
it->at(0) == '<' &&
it->at(1) == '!' &&
it->at(2) == '-')
continue;
if (it->length() > 4 && \*it == \_XtsTy::constval::br\_tag)
{
\_Tmp += \_XtsTy::constval::crlf;
continue;
}
if (it->length() > 1 && it->at(0) == '<' && it->at(1) != '/')
\_Tmp += \_XtsTy::constval::crlf;
\_Tmp += it->c\_str();
}
return \_Tmp;
}
private:
friend class xdocument\_t<\_XtsTy>;
\_Nodetype \*\_Node;
};
template<typename \_XtsTy>
class xdocument\_t {
public:
xdocument\_t() {
nodepath\_array.reserve(0x10);
}
~xdocument\_t() {
res.clear();
}
using \_StringTy = typename \_XtsTy::strtype;
using \_ParserTy = xparser\_t<\_XtsTy>;
using Basetype = typename \_XtsTy::Basetype;
using \_ResourceTy = xresource\_t<\_StringTy>;
using \_TagIndexTy = typename \_ResourceTy::xtagindex\_t;
using element = xelement\_t<\_XtsTy>;
using \_Nodetype = typename element::\_Nodetype;
int load\_file(const \_StringTy &\_Filename) {
errp.pos = 0;
errp.line = 0;
errp.column = 0;
res.clear();
std::ifstream fs(\_Filename.c\_str(), std::ios::binary);
fs.seekg(0, std::ios::end);
size\_t s = (size\_t)fs.tellg();
fs.seekg(0, std::ios::beg);
if (!s) {
errp.information.reserve(\_Filename.length() \* 3);
sprintf((char\*)errp.information.data(),
(sizeof(Basetype) != 1) ? "%ls" : "%s", \_Filename.c\_str());
errp.number = 29;
errp.pos = 0;
return -1;
}
char \*p = new char\[s + 2\];
p\[s\] = 0; p\[s + 1\] = 0;
fs.read(p, s);
fs.close();
size\_t \_Off = 0;
//预测文档编码,并不一定准确,只能说想到的判断都做了。
/\*返回值有4种:
0 多字节编码非utf-8
1 utf-16
2 utf-8
-1 错误
\*/
#if defined(_WIN32) || defined(_WIN64)
_SrcEncode = encode_adaptive::xmlec_predict(p, s, &(errp.number), &_Off);
if (_SrcEncode < 0) {
delete p;
errp.information.reserve(_Filename.length() * 3);
sprintf((char*)errp.information.data(),
(sizeof(Basetype) != 1) ? "%ls" : "%s", _Filename.c_str());
return -1;
}
//很遗憾的事情是,c++17删除了编码转换库,所以,只能使用操作系统的函数来完成了。
//虽然这个类库并不依赖c++17,但为了以后和新标准对接,所以只能自己实现跨平台的转换策略。
//另外一点是,linux其实对转码没有什么需求。
\_StringTy \_Text;
if (encode\_adaptive::specifiy(p + \_Off, \_SrcEncode, \_XtsTy::\_encoding, \_Text) == \_nf) {
delete p;
errp.information.reserve(\_Filename.length() \* 3);
sprintf((char\*)errp.information.data(),
(sizeof(Basetype) != 1) ? "%ls" : "%s", \_Filename.c\_str());
errp.number = 29;
return -1;
}
delete p;
\_ParserTy xp;
int \_Result = xp.load(\_Text.c\_str(), (xml\_size\_t)s, &res);
#else
_ParserTy xp;
int _Result = xp.load(p, (xml_size_t)s, &res);
delete p;
#endif
res.root.inner.end = res.docs.end();
xp.get_errp(errp);
if (errp.number) xp.get_err_pos(errp);
return _Result;
}
element get\_element(const \_StringTy &\_TagName) {
size\_t \_Off = 0;
size\_t \_Pos;
Basetype \*\_Ptr = (Basetype \*)\_TagName.c\_str();
nodepath\_array.clear();
auto i = res.tags.end();
for (;;) {
\_Pos = \_TagName.find('/', \_Off);
if (\_Pos == \_nf)
break;
\_Ptr\[\_Pos\] = 0;
i = res.tags.find(\_Ptr + \_Off);
\_Ptr\[\_Pos\] = '/';
if (i == res.tags.end()) return nullptr;
nodepath\_array.push\_back(&(i->second));
\_Off = \_Pos + 1;
}
i = res.tags.find(\_Ptr + \_Off);
if (i == res.tags.end()) return nullptr;
if (!nodepath\_array.size()) return \*(i->second.begin());
nodepath\_array.push\_back(&(i->second));
return recursive\_nodepath(nullptr, 0);
}
element get\_element(element &\_Parent, const \_StringTy &\_TagName) {
auto fit = res.tags.find(\_TagName);
if (fit != res.tags.end()) {
for (auto it = fit->second.begin(); it != fit->second.end(); ++it) {
if (\_Parent->\_Node == it->\_Node)
return it;
}
}
return nullptr;
}
std::string get\_error\_info() {
char buf\[256\];
std::string \_Result;
if (errp.pos != 0) {
sprintf(buf, "XML错误位于 行(%d), 列(%d):", errp.line, errp.column);
\_Result += buf;
}
sprintf(buf, xml\_error\_information\[errp.number\], errp.information.c\_str());
\_Result += buf;
return \_Result;
}
/// Returns an element referring to the document root node held in `res.root`.
element root() {
return &(res.root);
}
/// Returns the null element, the same value the get_element lookups return
/// when nothing matches.
element end() {
return nullptr;
}
private:
/// Resolves the tag path stored in `nodepath_array`, starting at level `i`.
///
/// Candidates of level `i` are visited in index order. At level 0 every
/// candidate is accepted; at deeper levels a candidate must have `_Parent` as
/// its `parent`. On the last level the first accepted candidate is returned;
/// otherwise the search descends and the first complete chain wins.
///
/// @param _Parent node chosen for the previous level (ignored when `i` is 0)
/// @param i       index into `nodepath_array` of the level being matched
/// @return the node matching the last path segment, or nullptr if none
_Nodetype *recursive_nodepath(_Nodetype *_Parent, size_t i) {
    _TagIndexTy *candidates = nodepath_array[i];
    const size_t next = i + 1;
    const bool last_level = (next == nodepath_array.size());
    for (auto node : *candidates) {
        if (i != 0 && node->parent != _Parent)
            continue;
        if (last_level)
            return node;
        if (_Nodetype *hit = recursive_nodepath(node, next))
            return hit;
    }
    return nullptr;
}
private:
// Parsed document storage; the methods here use its `tags` index, `root` node and `docs`.
_ResourceTy res;
// Last error: number, position (pos/line/column) and an information string.
xerrorpos errp;
// Source encoding id assigned from encode_adaptive::xmlec_predict on Windows builds.
int _SrcEncode;
// Scratch list of per-tag indexes, filled by get_element and read by recursive_nodepath.
std::vector<_TagIndexTy*> nodepath_array;
};
}
#if defined(_WIN32) || defined(_WIN64)
template
using xdoc = aqx_internal::xdocument_t<_Ty>;
using xts_utf8 = aqx_internal::xts_utf8;
using xts_utf16 = aqx_internal::xts_utf16;
using xts_asc = aqx_internal::xts_asc;
#else
using xdoc = aqx_internal::xdocument_t
#endif
}
#pragma warning(pop)
//encode_adaptive.h - windows only
#pragma once
#include
#include "tcvt.h"
#ifndef _nf
#define _nf ((size_t)-1)
#endif
namespace aqx {
namespace encode\_adaptive {
// Encoding identifiers shared by the detection and conversion helpers below.
static constexpr int unknow = -1;  // detection failed
static constexpr int sys = 0;      // system ANSI code page (converted with CP_ACP)
static constexpr int utf16 = 1;    // UTF-16
static constexpr int utf8 = 2;     // UTF-8
/// Guesses the encoding of a text buffer from its BOM or, failing that, from
/// the NUL bytes found in its first 16 bytes.
///
/// @param _Text buffer to inspect
/// @param _Size number of bytes in `_Text`
/// @param _Off  receives the number of BOM bytes to skip (3, 2 or 0)
/// @param _Def  value returned when nothing suggests UTF-16
/// @return 2 for a UTF-8 BOM, 1 for an FF FE BOM or a buffer that looks like
///         UTF-16, otherwise `_Def`
static int profile_predict(unsigned char *_Text, size_t _Size, int &_Off, int _Def = 0) {
    if (_Size >= 3 && _Text[0] == 0xEF && _Text[1] == 0xBB && _Text[2] == 0xBF) {
        _Off = 3;
        return 2;
    }
    if (_Size >= 2 && _Text[0] == 0xFF && _Text[1] == 0xFE) {
        _Off = 2;
        return 1;
    }
    _Off = 0;

    // No BOM: count NUL bytes in the first 16 bytes at most.
    const size_t probe = (_Size > 0x10) ? 0x10 : _Size;
    int zeros = 0;
    for (const unsigned char *p = _Text; p != _Text + probe; ++p) {
        if (*p == 0)
            ++zeros;
    }
    if (zeros == 0)
        return _Def;
    // A single NUL in a buffer that was scanned completely is not treated as
    // a sign of UTF-16.
    if (zeros == 1 && probe == _Size)
        return _Def;
    return 1;
}
template<typename \_Ty>
static int profile\_adaptive(char \*\_Text, size\_t \_Size, \_Ty &\_Result, int \_Def = 0) {
int \_StartOff = 0;
int \_SrcCode = encode\_adaptive::profile\_predict((unsigned char\*)\_Text, \_Size, \_StartOff, \_Def);
size\_t \_TargetCode = 0;
if (sizeof(decltype(\*\_Result.c\_str())) == 2)
\_TargetCode = 1;
std::wstring \_utf16;
if (\_SrcCode == 2)
aqx::utf16\_from\_utf8(\_utf16, \_Text + \_StartOff);
else if (\_SrcCode == 1)
\_utf16 = (wchar\_t\*)(\_Text + \_StartOff);
else
aqx::utf16\_from\_asc(\_utf16, \_Text + \_StartOff);
auto \_proc0 = \[\](void \*\_Res, std::wstring &\_wstr) { asc\_from\_utf16(\*(std::string\*)\_Res, \_wstr); };
auto \_proc1 = \[\](void \*\_Res, std::wstring &\_wstr) { \*(std::wstring\*)(\_Res) = \_wstr; };
auto \_proc2 = \[\](void \*\_Res, std::wstring &\_wstr) { aqx::utf8\_from\_utf16(\*(std::string\*)(\_Res), \_wstr); };
if (\_TargetCode == 0)
\_proc0(&\_Result, \_utf16);
else
\_proc1(&\_Result, \_utf16);
return \_SrcCode;
}
template<typename \_Ty>
static size\_t specifiy(char \*\_Text, int \_Srcec, int \_Targetec, \_Ty &\_Result) {
if (sizeof(\_Ty::\_Mybase::\_Alty::value\_type) == 1 && \_Targetec == 1)
return \_nf;
if (sizeof(\_Ty::\_Mybase::\_Alty::value\_type) == 2 && \_Targetec != 1)
return \_nf;
if (\_Srcec == 2) {
if (\_Targetec == 2)
{
\*(std::string\*)&\_Result = (\_Text);
return \_Result.length();
}
else if (\_Targetec == 1)
return utf16\_from\_utf8(\*(std::wstring\*)&\_Result, \_Text);
else
return asc\_from\_utf8(\*(std::string\*)&\_Result, \_Text);
}
else if (\_Srcec == 1)
{
if (\_Targetec == 2)
return utf8\_from\_utf16(\*(std::string\*)&\_Result, (wchar\_t\*)\_Text);
else if (\_Targetec == 1) {
\*(std::wstring\*)&\_Result = (wchar\_t\*)(\_Text);
return \_Result.length();
}
else
return asc\_from\_utf16(\*(std::string\*)&\_Result, (wchar\_t\*)\_Text);
}
else
{
if (\_Targetec == 2)
return utf8\_from\_asc(\*(std::string\*)&\_Result, \_Text);
else if (\_Targetec == 1)
return utf16\_from\_asc(\*(std::wstring\*)&\_Result, \_Text);
else {
\*(std::string\*)&\_Result = (\_Text);
return \_Result.length();
}
}
return \_nf;
}
// Appends `_Str` to the std::string that `_Res` points to.
static void unknow_append(void *_Res, std::string _Str) { *(std::string*)(_Res) += _Str; }
// Appends `_Str` to the std::wstring that `_Res` points to.
static void unknow_wappend(void *_Res, std::wstring _Str) { *(std::wstring*)(_Res) += _Str; }
/// Checks whether BOM-less wide text looks like an XML document.
///
/// @param _Text NUL-terminated wide text
/// @param _Size length of `_Text` in wchar_t units
/// @return 1 when a '<' is found and any XML declaration is well placed;
///         -1 when the text is shorter than 7 units or contains no '<';
///         -3 when an XML declaration does not start the text;
///         -4 when the declaration has no closing "?>"
static int xmlec_nbom_wchar(wchar_t *_Text, size_t _Size) {
    if (_Size < 7)
        return -1;  // too short to hold an XML document
    const wchar_t *lt = wcschr(_Text, L'<');
    if (!lt)
        return -1;
    if (lt[1] == L'?') {
        if (lt != _Text)
            return -3;  // declaration is not at the very beginning
        if (!wcsstr(_Text + 2, L"?>"))
            return -4;  // declaration is never closed
    }
    return 1;
}
static int xmlec\_nbom\_char(char \*\_Text, size\_t \_Size) {
auto p = strchr(\_Text, '<');
if (!p) return -1;
if (!p\[1\]) //找到第一个<,如果他它之后一个字符是0,则考虑它是不是utf16
{
if (p - \_Text == \_Size - 1) return -2;//如果它已经是字符串最后一个有效字符,直接报错。
if (\_Size % 2) return -2; //长度不是偶数,说明绝对不可能是utf16
return xmlec\_nbom\_wchar((wchar\_t\*)\_Text, (\_Size >> 1));
}
if (p\[1\] == '?') {
if (p != \_Text) return -3;//xml声明没有位于xml文件头部
p = strstr(\_Text + 2, "?>");
if (!p) return -4;//没有找到xml声明结尾
auto s = (p - \_Text) + 2;
std::string str(\_Text, p - \_Text + 2);
std::transform(str.begin(), str.end(), str.begin(), toupper);
if (str.find("UTF-8") != \_nf) return 2;
if (str.find("GBK") != \_nf) return 0;
if (str.find("GB2312") != \_nf) return 0;
}
return 2;
}
static int xmlec\_predict(char \*\_Text, size\_t \_Size, int \*err\_number, size\_t \*\_Off = NULL, int \_Default = 2) {
\*err\_number = 0;
if (\_Size < 7) {
//小于7字节的xml文档是不成立的
\*err\_number = 29;
return -1;
}
//先基于bom判断
if ((unsigned char)(\_Text\[0\]) == 0xEF && (unsigned char)(\_Text\[1\]) == 0xBB && (unsigned char)(\_Text\[2\]) == 0xBF) {
if (\_Off) \*\_Off = 3;
auto p = strchr(\_Text + 3, '<');
if (!p) {
\*err\_number = 29;
return -1;
}
if (p\[1\] == '?')
{
if (p != \_Text + 3) {
\*err\_number = 31;
return -1;
}
p = strstr(\_Text + 5, "?>");
if (!p) {
\*err\_number = 32;
return -1;
}
}
return 2;
}
else if ((unsigned char)(\_Text\[0\]) == 0xFF && (unsigned char)(\_Text)\[1\] == 0xFE) {
if (\_Off) \*\_Off = 2;
auto p = wcschr((wchar\_t\*)\_Text + 1, L'<');
if (!p) {
\*err\_number = 29;
return -1;
}
if (p\[1\] == L'?')
{
if (p != (wchar\_t\*)\_Text + 1) {
\*err\_number = 31;
return -1;
}
p = wcsstr((wchar\_t\*)\_Text + 3, L"?>");
if (!p) {
\*err\_number = 32;
return -1;
}
}
return 1;
}
if (\_Off) \*\_Off = 0;
int n = xmlec\_nbom\_char(\_Text, \_Size);
if (n < -1) {
if (n == -2)
\*err\_number = 29;
else if (n == -3)
\*err\_number = 31;
else if (n == -4)
\*err\_number = 32;
return -1;
}
else if (n >= 0) return n;
if (!(\_Size % 2))
n = xmlec\_nbom\_wchar((wchar\_t\*)\_Text, (\_Size >> 1));
if (n < -1) {
if (n == -2)
\*err\_number = 29;
else if (n == -3)
\*err\_number = 31;
else if (n == -4)
\*err\_number = 32;
return -1;
}
return \_Default;
}
};
}
//tcvt.h - windows only
#pragma once
#if defined(_WIN32) || defined(_WIN64)
#ifndef _WINDOWS_
#include <windows.h>
#endif
#endif
namespace aqx {
static size\_t \_mbs2wcs(int \_Cp, const std::string &\_Mbs, std::wstring &\_Wcs) {
int n = MultiByteToWideChar(\_Cp, 0, \_Mbs.c\_str(), (int)\_Mbs.length(), nullptr, 0);
\_Wcs.resize(n);
return MultiByteToWideChar(\_Cp, 0, \_Mbs.c\_str(), (int)\_Mbs.length(), (wchar\_t\*)\_Wcs.data(), (int)\_Wcs.capacity());
}
static size\_t \_wcs2mbs(int \_Cp, const std::wstring &\_Wcs, std::string &\_Mbs) {
int n = WideCharToMultiByte(\_Cp, 0, \_Wcs.c\_str(), (int)\_Wcs.length(), nullptr, 0, NULL, FALSE);
\_Mbs.resize(n);
return WideCharToMultiByte(\_Cp, 0, \_Wcs.c\_str(), (int)\_Wcs.length(), (char\*)\_Mbs.data(), (int)\_Mbs.capacity(), NULL, FALSE);
}
static size\_t utf8\_from\_asc(std::string &\_Result, const std::string &\_Asc) {
std::wstring \_Tmp;
\_mbs2wcs(CP\_ACP, \_Asc, \_Tmp);
return \_wcs2mbs(CP\_UTF8, \_Tmp, \_Result);
}
static size\_t utf16\_from\_asc(std::wstring &\_Result, const std::string &\_Asc) {
return \_mbs2wcs(CP\_ACP, \_Asc, \_Result);
}
static size\_t asc\_from\_utf8(std::string &\_Result, const std::string &\_U8s) {
std::wstring \_Tmp;
\_mbs2wcs(CP\_UTF8, \_U8s, \_Tmp);
return \_wcs2mbs(CP\_ACP, \_Tmp, \_Result);
}
static size\_t utf16\_from\_utf8(std::wstring &\_Result, const std::string &\_U8s) {
return \_mbs2wcs(CP\_UTF8, \_U8s, \_Result);
}
static size\_t utf8\_from\_utf16(std::string &\_Result, const std::wstring &\_Wcs) {
return \_wcs2mbs(CP\_UTF8, \_Wcs, \_Result);
}
static size\_t asc\_from\_utf16(std::string &\_Result, const std::wstring &\_Wcs) {
return \_wcs2mbs(CP\_ACP, \_Wcs, \_Result);
}
}
测试代码:
#include "pch.h"
#include
#include "xml.hpp"
#include
// Demo: loads an XML file, reports the parse time, then prints the inner XML
// of the element tagged CATALOG2.
int main()
{
    setlocale(LC_ALL, "");
    // Three document encodings are available: aqx::xts_utf16, aqx::xts_utf8, aqx::xts_asc
    aqx::xdoc<aqx::xts_utf16> doc;
    const clock_t started = clock();
    int err = doc.load_file(L"G:\\vs2017\\test\\生成\\test.xml");
    // clock() counts ticks; scale to milliseconds and print through a type
    // that matches the format specifier.
    printf("解析文档耗时:%ld ms\n", (long)((clock() - started) * 1000 / CLOCKS_PER_SEC));
    if (err) {
        printf("%s\n", doc.get_error_info().c_str());
        return 0;
    }
    auto e = doc.get_element(L"CATALOG2");
    // NOTE(review): get_element yields the null element when the tag is
    // missing; confirm get_inner_xml tolerates that before relying on it.
    printf("%ls\n", e.get_inner_xml().c_str());
    system("pause");
    return 0;
}
手机扫一扫
移动阅读更方便
你可能感兴趣的文章