非标准的xml解析器的C++实现:三、解析器的初步实现
阅读原文时间:2022年06月30日阅读:3

如同我之前的一篇文章说的那样,我没有支持DTD与命名空间。

当前实现出来的解析器,只能与xmlhttp对比,因为chrome浏览器解析大文档有bug,至于其他人实现的,我就不一一测试了,既然都决定自己实现了,我只选择大公司的代码做对比。

测试文档大小:3M bytes,约90000个节点。

aqx::xdoc :耗时 70-80ms,内存占用30-40M bytes,30和40主要是32位和64位的区别,如果要追求最少的内存占用,还可以更极端一些,解析速度很难再有本质的提升了,后续要完善的支持,也不会影响解析速度。

xmlhttp: 耗时3000-4000 ms,内存占用约 800M bytes。

目前测试过的系统有:

windows vc++ (vs2017) x86 x64

linux centos7 g++ version(9.1.1) x86 x64

windows中支持3种编码格式:utf-8、utf-16,以及操作系统对应的ANSI(本地代码页)编码;在简体中文windows中,也就是通常我们说的gb2312了。

然后目前,我不太可能针对实现细节将原理讲清楚,讲真的,C++的可读性真的非常糟糕,但对于这种需求,还是得用它,这种代码,我自己写完看着不难受,但对别人来说很可能是噩梦,同样的道理,我看别人的C++代码,也会困惑,要让看不懂代码的人,也理解实现细节,这是非常不科学的事。。。

好了,废话不说了,上代码:

//xml.hpp
#pragma once

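// NOTE(review): the header names on the original eight #include lines were
// lost when this page was extracted. The list below is reconstructed from the
// names the visible code uses; the eighth header is unknown - confirm against
// the original source.
#include <cstdio>   // sprintf
#include <cstring>  // strstr
#include <cwchar>   // wcsstr
#include <list>
#include <map>
#include <set>
#include <string>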
#if defined(_WIN32) || defined(_WIN64)
//我只支持了windows中的编码转换,所以这两个文件,仅在windows下使用。
#include "tcvt.h"
#include "encode_adaptive.h"
#endif

#pragma warning(push)
#pragma warning(disable:4996)

namespace aqx {

namespace aqx\_internal {

// Guarded so that the table is defined only once even if another header of
// the same project provides it under the same macro.
#ifndef __AQX_UTF8_CHAR_LEN
#define __AQX_UTF8_CHAR_LEN
// Maps a byte value (0-255) to the number of bytes the UTF-8 cursor advances
// when that byte is the current one:
//   0x00-0xBF -> 1 (ASCII, and also continuation bytes 0x80-0xBF)
//   0xC0-0xDF -> 2
//   0xE0-0xEF -> 3
//   0xF0-0xF7 -> 4
//   0xF8-0xFB -> 5, 0xFC-0xFD -> 6 (legacy long forms)
//   0xFE-0xFF -> 1
static unsigned char utf8_char_len[] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};
#endif

    // Syntax class of every single-byte value: xml_char_syntax[b] is the
    // bitwise OR of the XML_SYNTAX flags that byte b belongs to (0 = no class).
    //
    // The table covers all 256 byte values. Only 0x00-0x7F carry flags; the
    // upper half is zero so that a lookup with a byte >= 0x80 (for example a
    // stray UTF-8 continuation byte, whose length entry is 1) stays inside the
    // array instead of reading past a 128-entry table.
    static unsigned short xml_char_syntax[256] = {
        // 0x00-0x7F
        0,0,0,0,0,0,0,0,0,8,8,0,0,8,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        8,64,4,0,0,0,0,4,0,0,0,0,0,8208,16,128,
        1552,1552,1552,1552,1552,1552,1552,1552,1552,1552,0,256,1,2048,2,0,
        0,1072,1072,1072,1072,1072,1072,48,48,48,48,48,48,48,48,48,
        48,48,48,48,48,48,48,48,48,48,48,4096,0,0,0,48,
        0,1072,1072,1072,1072,1072,1072,48,48,48,48,48,48,48,48,48,
        48,48,48,48,48,48,48,48,48,48,48,0,0,0,0,0,
        // 0x80-0xFF: no syntax class
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    };

    namespace XML_SYNTAX {
        // Bit flags describing which syntax classes a character belongs to.
        // One character may carry several flags at once.

        static constexpr unsigned short _X_LT        = 0x0001;  // <
        static constexpr unsigned short _X_GT        = 0x0002;  // >
        static constexpr unsigned short _X_STRING    = 0x0004;  // quote characters ' and "
        static constexpr unsigned short _X_SPACE     = 0x0008;  // \t \n \r and space
        static constexpr unsigned short _X_NAME      = 0x0010;  // A-Z a-z 0-9 _ - .
        static constexpr unsigned short _X_BEGINNAME = 0x0020;  // A-Z a-z _
        static constexpr unsigned short _X_EXCLAM    = 0x0040;  // !
        static constexpr unsigned short _X_TAGEND    = 0x0080;  // /
        static constexpr unsigned short _X_ESCAPEEND = 0x0100;  // ;
        static constexpr unsigned short _X_NUMBER    = 0x0200;  // decimal digits 0-9
        static constexpr unsigned short _X_HEX       = 0x0400;  // hex digits 0-9 A-F a-f
        static constexpr unsigned short _X_EQUAL     = 0x0800;  // =
        static constexpr unsigned short _X_LB        = 0x1000;  // [
        static constexpr unsigned short _X_NEGATIVE  = 0x2000;  // -
        static constexpr unsigned short _X_MULTIBYTE = 0x4000;  // part of a multi-byte character
    }

    // Type used for every offset and length inside a document. It is an alias
    // so that the maximum document size can be widened later in one place.
    using xml_size_t = unsigned int;

    // Largest xml_size_t value (all bits set).
    static constexpr xml_size_t _xnf = static_cast<xml_size_t>(-1);

    // Position and length of one escape sequence. Reserved for fast
    // replacement later; not used yet because it would cost parsing speed.
    struct xml_escape_pos {
        xml_size_t pos;
        xml_size_t len;
    };

    // Forward declaration; the text cursor classes name it as a friend.
    template<typename _XtsTy>
    class xparser_t;

    //xml文本迭代器的基本模板类  
    template<typename \_Ty>  
    class xts\_t {  
    public:  
        using Basetype = \_Ty;

    protected:  
        const \_Ty \*text;  
        xml\_size\_t size;  
        xml\_size\_t index;  
        \_Ty c;  
        unsigned char cl;  
        unsigned short s;  
        unsigned short flags;  
    };

    //解析错误信息结构  
    //解析时不处理行,列问题,有错误发生时后处理,因为,行,列处理,会使解析速度慢差不多一倍。  
    struct xerrorpos {  
        xml\_size\_t pos;  
        int number;  
        std::string information;  
        xml\_size\_t line;  
        xml\_size\_t column;  
    };

    // String constants in narrow (char) form. Each text cursor class names
    // either this struct or its wide counterpart as `constval`, so templated
    // code can pick the literal that matches its character type.
    struct xmultybyte_constvalue {
        static constexpr const char *emp{ "" };
        static constexpr const char *br_tag{ "<br/>" };
        static constexpr const char *crlf{ "\r\n" };
        static constexpr const char *end_tag_syntax{ "</" };
        static constexpr const char *autoend_tag_syntax{ "/>" };
        static constexpr const char *comment_end{ "--" };
        static constexpr const char *cdata_end{ "]]>" };
    };

    // Wide-character (wchar_t) form of the same string constants.
    struct xwidechar_constvalue {
        static constexpr const wchar_t *emp{ L"" };
        static constexpr const wchar_t *br_tag{ L"<br/>" };
        static constexpr const wchar_t *crlf{ L"\r\n" };
        static constexpr const wchar_t *end_tag_syntax{ L"</" };
        static constexpr const wchar_t *autoend_tag_syntax{ L"/>" };
        static constexpr const wchar_t *comment_end{ L"--" };
        static constexpr const wchar_t *cdata_end{ L"]]>" };
    };

    //utf8的文本迭代器,先基于这个来实现  
    class xts\_utf8 : public xts\_t<char>  
    {  
    public:  
        using strtype = std::string;  
        static constexpr int \_encoding{ 2 };  
        using constval = xmultybyte\_constvalue;

        //初始化  
        void init(const char \*\_Text, xml\_size\_t \_Size) {  
            text = \_Text;  
            size = \_Size;  
            index = 0;  
            c = text\[0\];  
            cl = utf8\_char\_len\[(unsigned char)c\];  
            s = (cl != 1) ?  
                XML\_SYNTAX::\_X\_MULTIBYTE | XML\_SYNTAX::\_X\_BEGINNAME | XML\_SYNTAX::\_X\_NAME :  
                xml\_char\_syntax\[(unsigned char)c\];  
        }

        //处理下一个字符  
        void next() {  
            index += cl;  
            c = text\[index\];  
            cl = utf8\_char\_len\[(unsigned char)c\];  
            s = (cl != 1) ?  
                XML\_SYNTAX::\_X\_MULTIBYTE | XML\_SYNTAX::\_X\_BEGINNAME | XML\_SYNTAX::\_X\_NAME :  
                xml\_char\_syntax\[(unsigned char)c\];  
        }

        //向前回退n个字符,目前,只有在根节点之前的处理,有用到这个  
        void back(xml\_size\_t len) {  
            index -= len;  
            c = text\[index\];  
            cl = utf8\_char\_len\[(unsigned char)c\];  
            s = (cl != 1) ?  
                XML\_SYNTAX::\_X\_MULTIBYTE | XML\_SYNTAX::\_X\_BEGINNAME | XML\_SYNTAX::\_X\_NAME :  
                xml\_char\_syntax\[(unsigned char)c\];  
        }

        //next,并判断语法  
        bool next\_is\_flags() {  
            next();  
            return (flags & s) != 0;  
        }

        //next, 并判断下一个字符的值  
        bool next\_is\_char(char \_Chr) {  
            next();  
            return \_Chr == c;  
        }

        //解析错误时,用于获取行,列。  
        void next\_donot\_syntax() {  
            index += cl;  
            c = text\[index\];  
            cl = utf8\_char\_len\[(unsigned char)c\];  
        }

        //设置允许的语法  
        void set\_flags(unsigned short \_Flags) {  
            flags = \_Flags;  
        }

    private:  
        friend class xparser\_t<xts\_utf8>;  
    };

    //asc的文本迭代器  
    class xts\_asc : public xts\_t<char>  
    {  
    public:  
        using strtype = std::string;  
        static constexpr int \_encoding{ 0 };  
        using constval = xmultybyte\_constvalue;

        //初始化  
        void init(const char \*\_Text, xml\_size\_t \_Size) {  
            text = \_Text;  
            size = \_Size;  
            index = 0;  
            c = text\[0\];  
            cl = ((unsigned short)c >= 0x80) ? 2 : 1;  
            s = (cl != 1) ?  
                XML\_SYNTAX::\_X\_MULTIBYTE | XML\_SYNTAX::\_X\_BEGINNAME | XML\_SYNTAX::\_X\_NAME :  
                xml\_char\_syntax\[(unsigned char)c\];  
        }

        //处理下一个字符  
        void next() {  
            index += cl;  
            c = text\[index\];  
            cl = ((unsigned short)c >= 0x80) ? 2 : 1;  
            s = (cl != 1) ?  
                XML\_SYNTAX::\_X\_MULTIBYTE | XML\_SYNTAX::\_X\_BEGINNAME | XML\_SYNTAX::\_X\_NAME :  
                xml\_char\_syntax\[(unsigned char)c\];  
        }

        //向前回退n个字符,目前,只有在根节点之前的处理,有用到这个  
        void back(xml\_size\_t len) {  
            index -= len;  
            c = text\[index\];  
            cl = ((unsigned short)c >= 0x80) ? 2 : 1;  
            s = (cl != 1) ?  
                XML\_SYNTAX::\_X\_MULTIBYTE | XML\_SYNTAX::\_X\_BEGINNAME | XML\_SYNTAX::\_X\_NAME :  
                xml\_char\_syntax\[(unsigned char)c\];  
        }

        //next,并判断语法  
        bool next\_is\_flags() {  
            next();  
            return (flags & s) != 0;  
        }

        //next, 并判断下一个字符的值  
        bool next\_is\_char(char \_Chr) {  
            next();  
            return \_Chr == c;  
        }

        void next\_donot\_syntax() {  
            index += cl;  
            c = text\[index\];  
            cl = ((unsigned short)c >= 0x80) ? 2 : 1;  
        }

        //设置允许的语法  
        void set\_flags(unsigned short \_Flags) {  
            flags = \_Flags;  
        }

    private:  
        friend class xparser\_t<xts\_asc>;  
    };

    class xts\_utf16 : public xts\_t<wchar\_t> {  
    public:  
        using strtype = std::wstring;  
        static constexpr int \_encoding{ 1 };  
        using constval = xwidechar\_constvalue;  
        xts\_utf16() {  
            //utf16的字符不是变长的,固定为1  
            //虽然有4字节的utf16字符,但影响不到最终解析逻辑。  
            cl = 1;  
        }

    private:  
        void init(const wchar\_t \*\_Text, xml\_size\_t \_Size) {  
            text = \_Text;  
            size = \_Size;  
            index = 0;  
            c = text\[0\];  
            cl = 1;  
            s = ((unsigned short)c >= 0x80) ?  
                XML\_SYNTAX::\_X\_MULTIBYTE | XML\_SYNTAX::\_X\_BEGINNAME | XML\_SYNTAX::\_X\_NAME :  
                xml\_char\_syntax\[(unsigned char)c\];  
        }

        //处理下一个字符  
        void next() {  
            c = text\[++index\];  
            s = ((unsigned short)c >= 0x80) ?  
                XML\_SYNTAX::\_X\_MULTIBYTE | XML\_SYNTAX::\_X\_BEGINNAME | XML\_SYNTAX::\_X\_NAME :  
                xml\_char\_syntax\[(unsigned char)c\];  
        }

        void back(xml\_size\_t len) {  
            index -= len;  
            c = text\[index\];  
            s = ((unsigned short)c >= 0x80) ?  
                XML\_SYNTAX::\_X\_MULTIBYTE | XML\_SYNTAX::\_X\_BEGINNAME | XML\_SYNTAX::\_X\_NAME :  
                xml\_char\_syntax\[(unsigned char)c\];  
        }

        //next,并判断语法  
        bool next\_is\_flags() {  
            next();  
            return (flags & s) != 0;  
        }

        //next, 并判断下一个字符的值  
        bool next\_is\_char(char \_Chr) {  
            next();  
            return \_Chr == c;  
        }

        void next\_donot\_syntax() {  
            c = text\[++index\];  
        }

        //设置允许的语法  
        void set\_flags(unsigned short \_Flags) {  
            flags = \_Flags;  
        }  
    private:  
        friend class xparser\_t<xts\_utf16>;  
    };

    template<typename \_XtsTy>  
    class xdocument\_t;

    template<typename \_Ty>  
    class xelement\_t;

    template<typename \_Ty>  
    class xresource\_t {  
    public:

        //xml节点数据结构,因为结构相互依赖的原因,所以嵌套在一起  
        using Basetype = typename \_Ty::value\_type;  
        class xnode;  
        using xtagindex\_t = std::list<xnode\*>;  
        using xtagindex\_ref = typename xtagindex\_t::iterator;  
        using xdoctext\_t = std::list<\_Ty>;

        using xattrname\_t = std::set<\_Ty>;  
        using xattrvalue\_t = std::map<\_Ty, xml\_size\_t>;  
        using xtagtext\_t = std::map<\_Ty, xtagindex\_t>;  
        using xdoctext\_ref = typename xdoctext\_t::iterator;  
        using xtagtext\_ref = typename xtagtext\_t::iterator;  
        using xattrname\_ref = typename xattrname\_t::iterator;  
        using xattrvalue\_ref = typename xattrvalue\_t::iterator;

        class xnode {  
        public:  
            using \_Self\_Reftype = typename std::list<xnode>::iterator;  
            xnode() {  
                parent = nullptr;  
            }

            xnode(xnode \*\_Parent, xresource\_t \*\_Resource) {  
                parent = \_Parent;  
                ti.doc\_body\_ref = inner.end = inner.begin = \_Resource->docs.end();  
            }

        private:  
            void refactor\_tag\_body(int \_Style, xml\_size\_t \_PreSize, xresource\_t \*\_Resource)  
            {  
                \_Ty &\_Tmp = \_Resource->refactor\_buffer;  
                \_Tmp.clear();  
                \_Tmp.reserve(\_PreSize);  
                \_Tmp += (Basetype)'<';  
                \_Tmp += ti.name->first;

                for (auto it = attrs.begin(); it != attrs.end(); ++it) {  
                    \_Tmp += (Basetype)' ';  
                    \_Tmp += \*it->name;  
                    \_Tmp += (Basetype)'=';  
                    \_Tmp += (Basetype)it->st;  
                    \_Tmp += it->value->first;  
                    \_Tmp += (Basetype)it->st;  
                }  
                if (\_Style == 2)  
                    \_Tmp += (Basetype)'/';  
                \_Tmp += (Basetype)'>';  
                if (ti.doc\_body\_ref == \_Resource->docs.end()) {  
                    \_Resource->docs.push\_back(\_Tmp);  
                    ti.doc\_body\_ref = --(\_Resource->docs.end());  
                    parent->inner.end = ti.doc\_body\_ref;  
                    if (parent->inner.begin == \_Resource->docs.end())  
                        parent->inner.begin = ti.doc\_body\_ref;  
                }  
                else  
                {

                    \*ti.doc\_body\_ref = \_Tmp;  
                }  
            }

        private:  
            friend class xresource\_t;  
            friend class xparser\_t<xts\_utf8>;  
            friend class xparser\_t<xts\_utf16>;  
            friend class xparser\_t<xts\_asc>;  
            friend class xdocument\_t<xts\_utf8>;  
            friend class xdocument\_t<xts\_utf16>;  
            friend class xdocument\_t<xts\_asc>;  
            friend class xelement\_t<xts\_utf8>;  
            friend class xelement\_t<xts\_utf16>;  
            friend class xelement\_t<xts\_asc>;  
            struct xattr { xattrname\_ref name; xattrvalue\_ref value; char st; };

            struct tag\_info {  
                xtagtext\_ref name;//标签名称  
                xtagindex\_ref name\_index\_ref;//在标签名称索引中的引用,本质上其实就是个指针  
                xdoctext\_ref doc\_body\_ref;//整个标签信息,包含属性 <t ...> 在文档中的实体  
            }ti;

            std::list<xattr> attrs;  
            struct xinner { xdoctext\_ref begin, end; }inner;  
            std::list<xnode> child;  
            xnode \*parent;  
            \_Self\_Reftype self;  
        };

        xresource\_t() {  
            //预定义的几个转义符实体:lt gt amp quot apos  
            escape\_bodys\[{ (Basetype)'l', (Basetype)'t'}\] = { (Basetype)'<' };  
            escape\_bodys\[{ (Basetype)'g', (Basetype)'t' }\] = { (Basetype)'>' };  
            escape\_bodys\[{ (Basetype)'a', (Basetype)'m', (Basetype)'p' }\] = { (Basetype)'&' };  
            escape\_bodys\[{ (Basetype)'q', (Basetype)'u', (Basetype)'o', (Basetype)'t' }\] = { (Basetype)'"' };  
            escape\_bodys\[{ (Basetype)'a', (Basetype)'p', (Basetype)'o', (Basetype)'s' }\] = { (Basetype)'\\'' };  
        }

        void clear() {  
            root.child.clear();  
            docs.clear();  
            tags.clear();  
            attr\_names.clear();  
            attr\_values.clear();  
            root.parent = nullptr;  
            root.ti.doc\_body\_ref = root.inner.end = root.inner.begin = docs.end();  
        }

        xnode root;  
        xdoctext\_t docs;  
        xtagtext\_t tags;  
        xattrname\_t attr\_names;  
        xattrvalue\_t attr\_values;  
        std::map<\_Ty, \_Ty> escape\_bodys;  
        \_Ty refactor\_buffer;  
    };

    template<typename \_XtsTy>  
    class xparser\_t {  
    public:  
        using \_StringTy = typename \_XtsTy::strtype;  
        using Basetype = typename \_XtsTy::Basetype;  
        xparser\_t() {  
            // 这里忽悠一下编译器,自动根据类型选择:strstr 或 wcsstr  
            typedef const char \*(\*STRSTRFUNC)(const char \*, const char \*);  
            typedef const wchar\_t \*(\*WSTRSTRFUNC)(const wchar\_t \*, const wchar\_t \*);  
            typedef const Basetype \*(\*MYSTRSTRFUNC)(const void \*, const void \*);  
            \_\_multiec\_strstr = ((sizeof(Basetype) == 1) ?  
                ((MYSTRSTRFUNC)((STRSTRFUNC)strstr)) :  
                ((MYSTRSTRFUNC)((WSTRSTRFUNC)wcsstr)));  
        }

    private:

        void x\_escape\_number()  
        {  
            //数值类型的unicode字符转义处理  
            //这里我是自己实现的字符串转换数字,  
            //因为使用C标准转换需要额外拷贝一次 & 到 ; 字符串,为了避免这个拷贝,就要临时改变转义符结束符 ; 的位置为0来给strtol去计算  
            //而在之后的dom类的load\_string设计中,很可能会直接允许static const char \*xxx= "...";这样的东西传入到这里进行解析。  
            //在windows中,数据段的静态常数成员是的内存页面保护是PAGE\_EXECUTE\_READ,不能写操作。  
            //所以我在这里简单实现了字符串 => 数字。  
            xml\_size\_t ebgn = xts.index - 1;  
            long long x = 0;  
            if (!xts.next\_is\_char('x')) {  
                // # 后面如果不是x,就按10进制的规则来处理  
                if(!(xts.s & XML\_SYNTAX::\_X\_NUMBER)) err(xts.index, 22);  
                xts.set\_flags(XML\_SYNTAX::\_X\_NUMBER | XML\_SYNTAX::\_X\_ESCAPEEND);  
                for (;;) {  
                    x = (x \* 10) + (xts.c - '0');  
                    if (!xts.next\_is\_flags()) err(xts.index, 22);  
                    if (xts.c == ';')  
                        break;  
                }  
            }  
            else  
            {  
                // # 后面是x,按16进制处理  
                xts.set\_flags(XML\_SYNTAX::\_X\_HEX);  
                if (!xts.next\_is\_flags()) err(xts.index, 23);  
                xts.set\_flags(XML\_SYNTAX::\_X\_HEX | XML\_SYNTAX::\_X\_ESCAPEEND);

                int i = 0;  
                for (;; i++) {  
                    long long \_Tmp;  
                    switch (xts.c) {  
                    case '0':case '1':case '2':case '3':case '4':case '5':case '6':case '7':case '8':case '9':  
                        \_Tmp = xts.c - '0';  
                        break;  
                    case 'a':case 'b':case 'c':case 'd':case 'e':case 'f':  
                        \_Tmp = xts.c - 'a' + 10;  
                        break;  
                    case 'A':case 'B':case 'C':case 'D':case 'E':case 'F':  
                        \_Tmp = xts.c - 'A' + 10;  
                        break;  
                    }

                    x += (\_Tmp << (i << 2));  
                    if (!xts.next\_is\_flags()) err(xts.index, 23);  
                    if (xts.c == ';')  
                        break;  
                }

                //由于上面的十六进制数字计算顺序是反的,所以要从最高有效位来倒转  
                long long y = 0;  
                for (int k = 0; k <= i; k++)  
                    y += ((x >> (k << 2)) & 0x0F) << ((i - k) << 2);  
                x = y;  
            }

            if (x < 0x20) {  
                switch (x) {  
                case '\\t':case '\\r':case '\\n':  
                    break;  
                default:  
                    err(ebgn, 24);  
                }  
            }  
            else if (x > 0xD800 && x < 0xDFFF)  
                err(ebgn, 25);  
            else if (x > 0x10FFFF)  
                err(ebgn, 26);  
        }

        void x\_escape\_body()  
        {  
            xml\_size\_t nbgn = xts.index;  
            for (;;) {  
                xts.next();  
                if (xts.c == ';') {  
                    break;  
                }  
                else {  
                    if (!(xts.s & XML\_SYNTAX::\_X\_BEGINNAME)) err(xts.index, 19);  
                }  
            }

            \_StringTy &\_Tmp = \_strtmp\[4\];  
            \_Tmp.assign(xts.text + nbgn, xts.index - nbgn);  
            auto it = res->escape\_bodys.find(\_Tmp);  
            if (it == res->escape\_bodys.end()) {  
                errinfobuffer.reserve(\_Tmp.length() \* 3);  
                int n = sprintf((char\*)errinfobuffer.data(),  
                    ((sizeof(Basetype) != 1) ? "%ls" : "%s"),  
                    \_Tmp.c\_str());  
                err(nbgn, 20, errinfobuffer.c\_str());  
            }  
        }

        void x\_escape() {  
            xts.next();  
            if (xts.c == '#') {  
                x\_escape\_number();  
            }  
            else  
            {  
                if (!(xts.s & XML\_SYNTAX::\_X\_BEGINNAME)) err(xts.index, 18);  
                x\_escape\_body();  
            }  
        }

        void x\_cdata() {  
            xml\_size\_t cbgn = xts.index - 2;  
            const char \*pcdata = "CDATA\[";  
            for (int i = 0; i < 6; i++) {  
                if (!xts.next\_is\_char(pcdata\[i\]))  
                    err(xts.index, 16);  
            }

            // CDATA的结束符比注释标签还要省事,直接向后搜索\]\]>  
            const Basetype \*p = \_\_multiec\_strstr(xts.text + xts.index + 1, \_XtsTy::constval::cdata\_end);  
            if (p) {  
                xts.index = (xml\_size\_t)(p - xts.text + 3);  
                res->docs.push\_back(\_StringTy(xts.text + cbgn, xts.index - cbgn));  
                cur->inner.end = --(res->docs.end());  
                if (cur->inner.begin == res->docs.end()) cur->inner.begin = cur->inner.end;  
            }  
            else  
            {  
                err(cbgn, 17);  
            }  
        }

        void x\_comment() {  
            xml\_size\_t cbgn = xts.index - 2;  
            if (!xts.next\_is\_char('-')) err(xts.index, 13);  
            /\*  
                不太清楚为什么xml注释中不允许存在--,我反正照做了。  
                从代码此处看,实际上是可以允许的,就像是CDATA的结束符那样。  
                utf8的情况下,无法双字搜索。  
                utf16的情况下,也无法4字节搜索。  
                例如这种情况:  
                <--a-->,如果双字搜索,从a开始,有一个-就被忽略掉了,如果要判断这个问题,那实际上和单字节搜索一样的性能。  
            \*/

            const Basetype \*p = \_\_multiec\_strstr(xts.text + xts.index + 1, \_XtsTy::constval::comment\_end);  
            if (p) {  
                if (p\[2\] == '>') {  
                    xts.index = (xml\_size\_t)(p - xts.text + 3);  
                    res->docs.push\_back(\_StringTy(xts.text + cbgn, xts.index - cbgn));  
                    cur->inner.end = --(res->docs.end());  
                    if (cur->inner.begin == res->docs.end()) cur->inner.begin = cur->inner.end;  
                }  
                else  
                {  
                    err((xml\_size\_t)(p - xts.text), 15);  
                }  
            }  
            else  
            {  
                err(cbgn, 14);  
            }  
        }

        void x\_specifics\_tag() {  
            //特殊标签,共有两个分支,注释和CDATA,DTD在根节点之前处理,不会进入这里  
            xts.set\_flags(XML\_SYNTAX::\_X\_LB | XML\_SYNTAX::\_X\_NEGATIVE);  
            if (!xts.next\_is\_flags()) err(xts.index, 2);  
            if (xts.c == '-')  
                x\_comment();  
            else  
                x\_cdata();  
        }

        void x\_end\_node() {

            //结束标签处理

            xts.set\_flags(XML\_SYNTAX::\_X\_BEGINNAME);  
            if (!xts.next\_is\_flags()) err(xts.index, 10);

            xml\_size\_t nbgn = xts.index;  
            xml\_size\_t nend;  
            xts.set\_flags(XML\_SYNTAX::\_X\_NAME | XML\_SYNTAX::\_X\_SPACE | XML\_SYNTAX::\_X\_GT);  
            bool \_BackSpace = false;

            for (;;) {  
                if (!xts.next\_is\_flags()) err(xts.index, 10);  
                if (!(xts.s & XML\_SYNTAX::\_X\_NAME)) {  
                    nend = xts.index;  
                    if (xts.s & XML\_SYNTAX::\_X\_SPACE)  
                        \_BackSpace = true;  
                    break;  
                }  
            }

            if (\_BackSpace) {  
                //后面还有空格  
                xts.set\_flags(XML\_SYNTAX::\_X\_SPACE | XML\_SYNTAX::\_X\_GT);  
                for (;;) {  
                    if (!xts.next\_is\_flags()) err(xts.index, 11);  
                    if (xts.c == '>')  
                        break;  
                }  
            }

            \_StringTy &tmp = \_strtmp\[0\];

            tmp.reserve(nend - nbgn + 0x10);  
            tmp.assign(xts.text + nbgn, nend - nbgn);

            if (tmp != cur->ti.name->first) {  
                errinfobuffer.reserve((tmp.length() + cur->ti.name->first.length()) \* 3 + 0x20);  
                int n = sprintf((char\*)errinfobuffer.data(),  
                    ((sizeof(Basetype) != 1) ? "%ls 与 %ls 不一致" : "%s 与 %s 不一致"),  
                    tmp.c\_str(), cur->ti.name->first.c\_str());  
                err(nbgn, 12, errinfobuffer.c\_str());  
            }

            if (cur->inner.begin == res->docs.end()) {  
                //如果这个节点的内容为空,说明,它跟一个自结束的节点没有区别  
                //直接在父节点中将它修改一个自结束节点即可。  
                cur->parent->inner.end->erase(cur->parent->inner.end->length() - 1);  
                cur->parent->inner.end->append(\_XtsTy::constval::autoend\_tag\_syntax);  
            }  
            else  
            {  
                res->docs.push\_back(\_XtsTy::constval::end\_tag\_syntax);  
                auto it = --(res->docs.end());  
                it->append(tmp);  
                (\*it) += (Basetype)'>';  
                cur->inner.end = it;  
            }

            cur = cur->parent;  
        }

        int x\_tag\_name() {

            auto new\_name = \[this\](xml\_size\_t left, xml\_size\_t right) {  
                \_StringTy &tmp = \_strtmp\[0\];  
                tmp.assign(xts.text + left, right - left);

                auto it = res->tags.find(tmp);  
                if (it == res->tags.end())  
                    it = res->tags.insert({ tmp, std::list<\_Nodetype\*>() }).first;  
                it->second.push\_back(cur);  
                cur->ti.name = it;  
                cur->ti.name\_index\_ref = (--(it->second.end()));  
            };

            xts.set\_flags(  
                XML\_SYNTAX::\_X\_NAME |        //符合名称规范的字符  
                XML\_SYNTAX::\_X\_GT |        // >  
                XML\_SYNTAX::\_X\_TAGEND |    // /自结束标签  
                XML\_SYNTAX::\_X\_SPACE        // 空白字符  
            );

            xml\_size\_t name\_begin = xts.index;  
            for (;;) {

                if (!xts.next\_is\_flags()) err(xts.index, 1);

                switch (xts.c) {  
                case '>':  
                    new\_name(name\_begin, xts.index);  
                    return 1;  
                case '/':  
                    if (!xts.next\_is\_char('>')) err(xts.index, 4);  
                    new\_name(name\_begin, xts.index - 1);  
                    return 2;  
                default:  
                    if (xts.s & XML\_SYNTAX::\_X\_SPACE) {  
                        new\_name(name\_begin, xts.index);  
                        return 0;  
                    }  
                    break;  
                }  
            }  
        }

        bool x\_attr\_name(\_StringTy &\_Name) {  
            xts.set\_flags(  
                XML\_SYNTAX::\_X\_NAME |        //符合名称规范的字符  
                XML\_SYNTAX::\_X\_EQUAL |    //等于号  
                XML\_SYNTAX::\_X\_SPACE        // 空白字符  
            );  
            xml\_size\_t name\_begin = xts.index;  
            for (;;) {  
                if (!xts.next\_is\_flags()) err(xts.index, 5);  
                if (xts.s & (XML\_SYNTAX::\_X\_EQUAL | XML\_SYNTAX::\_X\_SPACE)) {  
                    \_Name.assign(xts.text + name\_begin, xts.index - name\_begin);  
                    return xts.c == '=';  
                }  
            }  
            return false;  
        }

        char x\_attr\_value(\_StringTy &\_Value) {  
            xts.set\_flags(  
                XML\_SYNTAX::\_X\_STRING | //字符串 " '  
                XML\_SYNTAX::\_X\_SPACE // 空白字符  
            );

            char \_Style;

            for (;;) {  
                if (!xts.next\_is\_flags()) err(xts.index, 7);  
                if (xts.s & XML\_SYNTAX::\_X\_STRING) {  
                    \_Style = (char)xts.c;  
                    break;  
                }  
            }

            xml\_size\_t value\_begin = xts.index + 1;

            if (\_Style == '"') {  
                for (;;) {  
                    xts.next();  
                    switch (xts.c) {  
                    case 0:  
                        err(xts.index, 8);  
                    case '<':  
                        err(xts.index, 9);  
                    case '&':  
                        //处理转义符  
                        x\_escape();  
                        break;

                    case '"':  
                        //字符串结束  
                        \_Value.assign(xts.text + value\_begin, xts.index - value\_begin);  
                        return \_Style;  
                    default:  
                        break;  
                    }  
                }  
            }  
            else  
            {  
                for (;;) {  
                    xts.next();  
                    switch (xts.c) {  
                    case 0:  
                        err(xts.index, 8);  
                    case '<':  
                        err(xts.index, 9);  
                    case '&':  
                        //处理转义符  
                        x\_escape();  
                        break;

                    case '\\'':  
                        //字符串结束  
                        \_Value.assign(xts.text + value\_begin, xts.index - value\_begin);  
                        return \_Style;  
                    default:  
                        break;  
                    }  
                }

            }

            return \_Style;  
        }

        void x\_attr(xml\_size\_t &\_Presize) {  
            \_StringTy &name = \_strtmp\[0\];  
            \_StringTy &value = \_strtmp\[1\];

            if (!x\_attr\_name(name)) {  
                //x\_attr\_name中没有找到等于号,对应这种: <a x  =...  
                xts.set\_flags(  
                    XML\_SYNTAX::\_X\_EQUAL | //等号  
                    XML\_SYNTAX::\_X\_SPACE // 空白字符  
                );

                for (;;) {  
                    if (!xts.next\_is\_flags()) err(xts.index, 7);  
                    if (xts.c == '=') break;  
                }  
            }

            char \_Style = x\_attr\_value(value);  
            \_Presize += (xml\_size\_t)(name.length() + value.length() + 6);

            auto itn = res->attr\_names.find(name);  
            if (itn == res->attr\_names.end())  
                itn = res->attr\_names.insert(name).first;

            auto itv = res->attr\_values.find(value);  
            if (itv == res->attr\_values.end())  
                itv = res->attr\_values.insert({ value, 1 }).first;

            cur->attrs.push\_back({ itn, itv, \_Style });  
        }

        // Scans the remainder of a start tag and hands every attribute to x\_attr().  
        // Returns 1 when the tag is closed by '>' and 2 when it is closed by '/>'.  
        // Raises error 5 when the next character is outside the accepted set and  
        // error 4 when '/' is not immediately followed by '>'.  
        // \_Presize is forwarded unchanged to x\_attr().  
        int x\_preattr(xml\_size\_t &\_Presize) {

            /\*  
                Reached when x\_tag\_name did not find '>'. Before attribute  
                parsing starts, the input looks like one of these:  
                <a >  
                <a />  
                <a x=...  
            \*/

            xts.set\_flags(  
                XML\_SYNTAX::\_X\_SPACE |        // whitespace  
                XML\_SYNTAX::\_X\_GT |            // >  
                XML\_SYNTAX::\_X\_BEGINNAME |        // first character of a name  
                XML\_SYNTAX::\_X\_TAGEND            // '/' of a self-closing tag  
            );

            for (;;) {  
                if (!xts.next\_is\_flags()) err(xts.index, 5);

                switch (xts.c) {  
                case '>':  
                    return 1;  
                case '/':  
                    if (!xts.next\_is\_char('>')) err(xts.index, 4);  
                    return 2;  
                default:

                    // A name-start character begins an attribute; anything else  
                    // accepted by the flags (whitespace) is simply skipped.  
                    if (xts.s & XML\_SYNTAX::\_X\_BEGINNAME) {  
                        x\_attr(\_Presize);  
                        // Re-arm the accepted set for the next iteration after  
                        // the attribute has been parsed.  
                        xts.set\_flags(  
                            XML\_SYNTAX::\_X\_SPACE |        // whitespace  
                            XML\_SYNTAX::\_X\_GT |            // >  
                            XML\_SYNTAX::\_X\_BEGINNAME |        // first character of a name  
                            XML\_SYNTAX::\_X\_TAGEND            // '/' of a self-closing tag  
                        );  
                    }  
                    break;  
                }  
            }

        }

        // Creates a child of the current node for a start tag and parses that tag.  
        // The new node becomes `cur`; if the tag turns out to be self-closing  
        // (n == 2, the value x\_preattr() returns for '/>'), `cur` moves back to  
        // the parent before returning.  
        void x\_new\_node() {

            // Append the node, then remember its own list position in `self`.  
            cur->child.push\_back(\_Nodetype(cur, res));  
            auto it = (--cur->child.end());  
            cur = &(\*it);  
            cur->self = it;

            int n = x\_tag\_name();

            // Size estimate starts from the tag name length plus 3; x\_preattr()  
            // passes it on to x\_attr(), which adds to it per attribute.  
            xml\_size\_t \_PreSize = (xml\_size\_t)(cur->ti.name->first.length() + 3);  
            // n == 0: the name scan did not finish the tag, so continue with the  
            // attribute section.  
            if (!n) n = x\_preattr(\_PreSize);

            cur->refactor\_tag\_body(n, \_PreSize, res);

            if (n == 2)  
                cur = cur->parent;  
        }

        // Dispatches on the character that follows '<':  
        //   '!'  -> x\_specifics\_tag()  
        //   '/'  -> x\_end\_node()  
        //   name-start character -> x\_new\_node()  
        // Any other character raises error 1.  
        void x\_tag() {  
            // After '<' the next character must be a valid name-start character,  
            // an exclamation mark, or the '/' of an end tag.  
            xts.next();  
            switch (xts.c) {  
            case '!':  
                x\_specifics\_tag();  
                break;  
            case '/':  
                x\_end\_node();  
                break;  
            default:  
                if (!(xts.s & XML\_SYNTAX::\_X\_BEGINNAME)) err(xts.index, 1);  
                x\_new\_node();  
                break;  
            }  
        }

        // Main content loop: consumes character data between tags until the  
        // scanner reports character 0, calling x\_tag() for every '<'.  
        // Runs of non-whitespace characters are collected into a scratch string,  
        // joined by a single space, and pushed to res->docs right before the  
        // next tag is handled.  
        void x\_text() {

            // Handles the meaningful text that appears outside of tags.

            // tbegin is the start offset of the current non-whitespace run, or  
            // \_xnf when no run is open.  
            xml\_size\_t tbegin = \_xnf;  
            \_StringTy &tmp = \_strtmp\[3\];  
            tmp.clear();

            for (;;) {  
                xts.next();  
                switch (xts.c) {  
                case 0:  
                    // Character 0 ends the scan.  
                    return;  
                case '&':  
                    // An entity reference counts as text: open a run if needed.  
                    if (tbegin == \_xnf)  
                        tbegin = xts.index;  
                    x\_escape();  
                    break;  
                case '<':  
                    // Flush the pending text before handling the tag.  
                    if (tbegin != \_xnf) {  
                        if (tmp.length()) tmp += ' ';  
                        tmp.append(xts.text + tbegin, xts.index - tbegin);  
                        tbegin = \_xnf;  
                    }

                    if (tmp.length()) {  
                        // Store the text and extend the current node's inner  
                        // range to include the new entry.  
                        res->docs.push\_back(tmp);  
                        cur->inner.end = --(res->docs.end());  
                        if (cur->inner.begin == res->docs.end()) cur->inner.begin = cur->inner.end;  
                        tmp.clear();  
                    }

                    x\_tag();  
                    break;  
                default:  
                    if (!(xts.s & XML\_SYNTAX::\_X\_SPACE)) {  
                        // Non-whitespace: open a run if none is open.  
                        if (tbegin == \_xnf)  
                            tbegin = xts.index;  
                    }  
                    else  
                    {  
                        // Whitespace: if a run is open, append it to the  
                        // accumulated text and close it.  
                        if (tbegin != \_xnf) {  
                            if (tmp.length()) tmp += ' ';  
                            tmp.append(xts.text + tbegin, xts.index - tbegin);  
                            tbegin = \_xnf;  
                        }  
                    }  
                }  
            }  
        }

        void x\_dtd() {  
            xml\_size\_t pos = xts.index - 1;  
            const char \*p = "OCTYPE";  
            for (int i = 0; i < 6; i++) {  
                if (!xts.next\_is\_char(p\[i\])) err(pos, 2);  
            }

            int n = 1;  
            int \_StrType = 0;  
            for (;;) {  
                xts.next();  
                switch (xts.c) {  
                case '"':  
                case '\\'':  
                    if (!\_StrType)  
                        \_StrType = xts.c;  
                    else if (\_StrType == xts.c)  
                        \_StrType = 0;  
                    break;  
                case '<':  
                    n++;  
                    break;  
                case '>':  
                    if (!\_StrType) {  
                        if (!(--n)) {

                            res->docs.push\_back(\_StringTy(xts.text + pos, xts.index - pos + 1));  
                            cur->inner.end = --(res->docs.end());  
                            if (cur->inner.begin == res->docs.end()) cur->inner.begin = cur->inner.end;  
                            //wprintf(L"%s\\n", cur->inner.end->c\_str());  
                            return;  
                        }  
                    }  
                    break;  
                default:  
                    break;  
                }  
            }  
        }

        void x\_declare() {  
            xml\_size\_t pos = xts.index - 1;  
            int \_StrType = 0;  
            for (;;) {  
                xts.next();  
                switch (xts.c) {  
                case '"':  
                case '\\'':  
                    if (!\_StrType)  
                        \_StrType = xts.c;  
                    else if (\_StrType == xts.c)  
                        \_StrType = 0;  
                    break;  
                case '?':  
                    if (!\_StrType) {  
                        if (!xts.next\_is\_char('>')) err(xts.index, 30);  
                        res->docs.push\_back(\_StringTy(xts.text + pos, xts.index - pos + 1));  
                        cur->inner.end = --(res->docs.end());  
                        if (cur->inner.begin == res->docs.end()) cur->inner.begin = cur->inner.end;  
                        return;  
                    }  
                    break;  
                default:  
                    break;  
                }  
            }  
        }

        // Top-level driver: handles everything before the root element  
        // (declaration, comments, DOCTYPE), then parses the root element and the  
        // rest of the document.  
        // Returns 0 on success and -1 when err() was called anywhere below;  
        // err() comes back here through longjmp to the setjmp on the first line.  
        int x\_root() {  
            if (setjmp(\_Rem)) return -1;  
            // Before the root element only '<' and whitespace are accepted.  
            xts.set\_flags(XML\_SYNTAX::\_X\_LT | XML\_SYNTAX::\_X\_SPACE);  
            bool root\_break = false;  
            for (;;) {  
                if (xts.c == '<') {  
                    xts.next();  
                    switch (xts.c) {  
                    case '!':  
                        // "<!-" starts a comment, "<!D" a DOCTYPE; anything else  
                        // is error 2.  
                        xts.next();  
                        if (xts.c == '-')  
                            x\_comment();  
                        else if (xts.c == 'D')  
                            x\_dtd();  
                        else  
                            err(xts.index, 2);  
                        xts.set\_flags(XML\_SYNTAX::\_X\_LT | XML\_SYNTAX::\_X\_SPACE);  
                        break;  
                    case '?':  
                        // The declaration is only accepted when the '?' sits at  
                        // index 1; otherwise error 31.  
                        if (xts.index != 1)  
                            err(xts.index, 31);  
                        x\_declare();  
                        xts.set\_flags(XML\_SYNTAX::\_X\_LT | XML\_SYNTAX::\_X\_SPACE);  
                        break;  
                    default:  
                        if (xts.s & XML\_SYNTAX::\_X\_BEGINNAME) {  
                            // Root element: step back one position so x\_tag()  
                            // reads the name-start character again, then let  
                            // x\_text() run the content loop.  
                            xts.back(1);  
                            x\_tag();  
                            x\_text();  
                            root\_break = true;  
                        }  
                        else  
                        {  
                            err(xts.index, 2);  
                        }  
                        break;  
                    }

                    if (root\_break)  
                        break;  
                }

                if (!xts.next\_is\_flags())  
                    err(xts.index, 3);

            }

            if (cur != &(res->root)) {  
                // After the whole text has been parsed the current node must be  
                // the document root again; otherwise an element was left open.  
                err(xts.size, 28);  
            }

            return 0;  
        }

        void err(xml\_size\_t \_Pos, int \_Number, const char \*\_Info = "") {  
            errp = { \_Pos, \_Number, \_Info };  
            longjmp(\_Rem, 1);  
        }

    public:  
        int load(const Basetype \*\_Text, int \_Size, xresource\_t<\_StringTy> \*pres) {  
            xts.init((const Basetype\*)\_Text, \_Size);  
            errp.number = 0;  
            errp.pos = 0;  
            res = pres;  
            cur = &(res->root);  
            return x\_root();  
        }

        void get\_errp(xerrorpos &e) {  
            e = errp;  
        }

        // Fills e.line and e.column (both starting at 1) for the offset e.pos by  
        // rescanning the text from the beginning and counting '\n' characters.  
        // Both stay 0 when e.pos or e.number is 0.  
        // The scanner position is saved before the rescan and restored after it.  
        void get\_err\_pos(xerrorpos &e) {  
            e.line = 0;  
            e.column = 0;  
            if (!e.pos || !e.number)  
                return;  
            // Remember where the scanner was.  
            // NOTE(review): xts.cl is presumably the width of the current  
            // character, so that the xts.next() at the end lands on the same  
            // character again -- confirm against the scanner class.  
            auto pos = xts.index - xts.cl;  
            xts.index = 0;  
            xts.c = xts.text\[0\];  
            e.line = 1;  
            e.column = 1;  
            for (; xts.index < e.pos; xts.next\_donot\_syntax())  
            {  
                if (xts.c == '\\n') {  
                    e.line++;  
                    e.column = 1;  
                }  
                else  
                {  
                    e.column++;  
                }  
            }  
            // Restore the scanner.  
            xts.index = pos;  
            xts.next();  
        }

    private:  
        friend class xelement\_t<\_XtsTy>;  
        jmp\_buf \_Rem;            // jump target armed by x\_root() and used by err()  
        \_XtsTy xts;             // text scanner  
        xerrorpos errp;         // last error record, written by err()  
        using \_Nodetype = typename xresource\_t<\_StringTy>::xnode;  
        \_Nodetype \*cur;         // node currently being filled  
        xresource\_t<\_StringTy> \*res;   // output resource passed to load()  
        \_StringTy \_strtmp\[8\];// Errors leave the parser through a long jump (jmp\_buf), so the string objects the parse functions would otherwise keep on the stack are stored here to avoid leaking them.  
        const Basetype\*(\*\_\_multiec\_strstr)(const void\*, const void\*);  
        std::string errinfobuffer;  
    };

    // Error message table, indexed by the error number passed to err() and  
    // stored in the error record. get\_error\_info() uses the selected entry as a  
    // printf format with the record's information string as the only argument,  
    // so an entry may contain at most one %s.  
    static const char \*xml\_error\_information\[\] = {  
        "",  
        "开始标签:无效的元素名称",        //1: start tag: invalid element name  
        "根节点之前的无效的特殊标签",        //2: invalid special tag before the root node  
        "根节点之前的无效的字符",            //3: invalid character before the root node  
        "自结束标签:此处应为 >",            //4: self-closing tag: '>' expected here  
        "标签属性:无效的标签属性名称",        //5: attribute: invalid attribute name  
        "标签属性:此处应为 =",            //6: attribute: '=' expected here  
        "标签属性:此处应为 \\" 或 '",        //7: attribute: double or single quote expected here  
        "标签属性:未找到对应的属性结束符(\\" 或 ')",    //8: attribute: closing quote not found  
        "标签属性:< 不允许出现在属性值中",    //9: attribute: '<' is not allowed in an attribute value  
        "结束标签:无效的元素名称",        //10: end tag: invalid element name  
        "结束标签:此处应为 >",            //11: end tag: '>' expected here  
        "结束标签:开始标签与结束标签不匹配,参考信息:%s",    //12: end tag does not match the start tag, details: %s  
        "注释标签:无效的注释标签,此处或许应为 -",//13: comment: invalid comment tag, '-' was probably intended  
        "注释标签:未找到注释标签结束符(-->)",//14: comment: terminator (-->) not found  
        "注释标签:-- 不允许单独出现在注释标签中",//15: comment: '--' may not appear on its own inside a comment  
        "CDATA:无效的CDATA标签",//16: CDATA: invalid CDATA tag  
        "CDATA:未找到CDATA结束符(\]\]>)",//17: CDATA: terminator not found  
        "转义符:无效的转义符名称首字符",//18: entity: invalid first character of the entity name  
        "转义符:无效的转义符字符",//19: entity: invalid character in entity  
        "转义符:%s 是未定义的实体",//20: entity: %s is not a defined entity  
        "转义符:无效的转义符字符",//21: entity: invalid character in entity  
        "字符数值转义:无效的10进制数字字符",//22: character reference: invalid decimal digit  
        "字符数值转义:无效的16进制数字字符",//23: character reference: invalid hexadecimal digit  
        "字符数值转义:小于32(0x20)的字符仅允许\\\\t\\\\r\\\\n出现在xml中",//24: character reference: below 0x20 only tab, CR and LF are allowed  
        "字符数值转义:0xD800-0xDFFF为UNICODE代理字符,不允许单独出现在xml中",//25: character reference: lone surrogate (0xD800-0xDFFF) is not allowed  
        "字符数值转义:字符值溢出,参考最大值(0x10FFFF)",//26: character reference: value overflow (maximum 0x10FFFF)  
        "转义符:无效的转义符",//27: entity: invalid entity  
        "根节点未封闭",//28: root node is not closed  
        "无效的文档:%s",//29: invalid document: %s  
        "XML声明种错误的符号,此处应为 >",//30: wrong symbol in the XML declaration, '>' expected here  
        "XML声明前不允许存在其他字符",//31: no characters are allowed before the XML declaration  
        "未找到XML声明结束符(?>)",//32: XML declaration terminator (?>) not found  
    };

    template<typename \_XtsTy>  
    class xelement\_t {  
    public:

        using \_StringTy = typename \_XtsTy::strtype;  
        using \_Nodetype = typename xresource\_t<\_StringTy>::xnode;  
        using Basetype = typename xresource\_t<\_StringTy>::Basetype;  
        bool eof() {  
            return \_Node == nullptr;  
        }

        xelement\_t(\_Nodetype \*\_Val) {  
            \_Node = \_Val;  
        }

        bool operator==(xelement\_t &e) {  
            return e.\_Node == \_Node;  
        }

        bool operator!=(xelement\_t &e) {  
            return e.\_Node != \_Node;  
        }

        \_StringTy get\_name() {  
            if (eof()) return \_XtsTy::constval::emp;  
            return \_Node->ti.name->first;  
        }

        \_StringTy get\_attr(const \_StringTy &\_AttrName) {  
            if (eof()) return \_XtsTy::constval::emp;  
            for (auto it = \_Node->attrs.begin(); it != \_Node->attrs.end(); ++it) {  
                if (\*(it->name) == \_AttrName)  
                    return it->value->first;  
            }  
            return "";  
        }

        \_StringTy get\_text(int \_Flags = 0) {  
            if (eof()) return \_XtsTy::constval::emp;  
            \_StringTy \_Tmp;  
            auto begin = \_Node->inner.begin;  
            auto end = \_Node->inner.end;  
            if (\_Flags & 1) {  
                --begin;  
                ++end;  
            }

            for (auto it = begin; it != end; ++it) {  
                if (it->length() > 6 && it->at(0) == '<' && it->at(1) == '!' && it->at(2) == '-')  
                    continue;  
                \_Tmp += it->c\_str();  
            }

            return \_Tmp;  
        }

        \_StringTy get\_inner\_xml() {  
            if (eof()) return \_XtsTy::constval::emp;  
            \_StringTy \_Tmp;  
            for (auto it = \_Node->inner.begin; it != \_Node->inner.end; ++it) {  
                if (it->length() > 3 &&  
                    it->at(0) == '<' &&  
                    it->at(1) == '!' &&  
                    it->at(2) == '-')  
                    continue;

                if (it->length() > 4 && \*it == \_XtsTy::constval::br\_tag)  
                {  
                    \_Tmp += \_XtsTy::constval::crlf;  
                    continue;  
                }

                if (it->length() > 1 && it->at(0) == '<' && it->at(1) != '/')  
                    \_Tmp += \_XtsTy::constval::crlf;  
                \_Tmp += it->c\_str();  
            }  
            return \_Tmp;  
        }

    private:  
        friend class xdocument\_t<\_XtsTy>;  
        \_Nodetype \*\_Node;  
    };

    template<typename \_XtsTy>  
    class xdocument\_t {  
    public:  
        xdocument\_t() {  
            nodepath\_array.reserve(0x10);  
        }

        ~xdocument\_t() {  
            res.clear();  
        }

        using \_StringTy = typename \_XtsTy::strtype;  
        using \_ParserTy = xparser\_t<\_XtsTy>;  
        using Basetype = typename \_XtsTy::Basetype;  
        using \_ResourceTy = xresource\_t<\_StringTy>;  
        using \_TagIndexTy = typename \_ResourceTy::xtagindex\_t;

        using element = xelement\_t<\_XtsTy>;  
        using \_Nodetype = typename element::\_Nodetype;

        int load\_file(const \_StringTy &\_Filename) {  
            errp.pos = 0;  
            errp.line = 0;  
            errp.column = 0;  
            res.clear();  
            std::ifstream fs(\_Filename.c\_str(), std::ios::binary);  
            fs.seekg(0, std::ios::end);  
            size\_t s = (size\_t)fs.tellg();  
            fs.seekg(0, std::ios::beg);

            if (!s) {  
                errp.information.reserve(\_Filename.length() \* 3);  
                sprintf((char\*)errp.information.data(),  
                    (sizeof(Basetype) != 1) ? "%ls" : "%s", \_Filename.c\_str());

                errp.number = 29;  
                errp.pos = 0;  
                return -1;  
            }

            char \*p = new char\[s + 2\];  
            p\[s\] = 0; p\[s + 1\] = 0;  
            fs.read(p, s);  
            fs.close();  
            size\_t \_Off = 0;

            //预测文档编码,并不一定准确,只能说想到的判断都做了。  
            /\*返回值有4种:  
                0 多字节编码非utf-8  
                1 utf-16  
                2 utf-8  
                -1 错误  
            \*/

#if defined(_WIN32) || defined(_WIN64)
_SrcEncode = encode_adaptive::xmlec_predict(p, s, &(errp.number), &_Off);
if (_SrcEncode < 0) {
delete p;
errp.information.reserve(_Filename.length() * 3);
sprintf((char*)errp.information.data(),
(sizeof(Basetype) != 1) ? "%ls" : "%s", _Filename.c_str());
return -1;
}

            //很遗憾的事情是,c++17删除了编码转换库,所以,只能使用操作系统的函数来完成了。  
            //虽然这个类库并不依赖c++17,但为了以后和新标准对接,所以只能自己实现跨平台的转换策略。  
            //另外一点是,linux其实对转码没有什么需求。  
            \_StringTy \_Text;  
            if (encode\_adaptive::specifiy(p + \_Off, \_SrcEncode, \_XtsTy::\_encoding, \_Text) == \_nf) {  
                delete p;  
                errp.information.reserve(\_Filename.length() \* 3);  
                sprintf((char\*)errp.information.data(),  
                    (sizeof(Basetype) != 1) ? "%ls" : "%s", \_Filename.c\_str());  
                errp.number = 29;  
                return -1;  
            }  
            delete p;  
            \_ParserTy xp;  
            int \_Result = xp.load(\_Text.c\_str(), (xml\_size\_t)s, &res);  

#else
_ParserTy xp;
int _Result = xp.load(p, (xml_size_t)s, &res);
delete p;
#endif
res.root.inner.end = res.docs.end();
xp.get_errp(errp);
if (errp.number) xp.get_err_pos(errp);
return _Result;
}

        element get\_element(const \_StringTy &\_TagName) {  
            size\_t \_Off = 0;  
            size\_t \_Pos;  
            Basetype \*\_Ptr = (Basetype \*)\_TagName.c\_str();  
            nodepath\_array.clear();  
            auto i = res.tags.end();  
            for (;;) {  
                \_Pos = \_TagName.find('/', \_Off);  
                if (\_Pos == \_nf)  
                    break;  
                \_Ptr\[\_Pos\] = 0;  
                i = res.tags.find(\_Ptr + \_Off);  
                \_Ptr\[\_Pos\] = '/';  
                if (i == res.tags.end()) return nullptr;  
                nodepath\_array.push\_back(&(i->second));  
                \_Off = \_Pos + 1;  
            }  
            i = res.tags.find(\_Ptr + \_Off);  
            if (i == res.tags.end()) return nullptr;  
            if (!nodepath\_array.size()) return \*(i->second.begin());  
            nodepath\_array.push\_back(&(i->second));  
            return recursive\_nodepath(nullptr, 0);  
        }

        element get\_element(element &\_Parent, const \_StringTy &\_TagName) {  
            auto fit = res.tags.find(\_TagName);  
            if (fit != res.tags.end()) {  
                for (auto it = fit->second.begin(); it != fit->second.end(); ++it) {  
                    if (\_Parent->\_Node == it->\_Node)  
                        return it;  
                }  
            }  
            return nullptr;  
        }

        std::string get\_error\_info() {  
            char buf\[256\];  
            std::string \_Result;  
            if (errp.pos != 0) {  
                sprintf(buf, "XML错误位于 行(%d), 列(%d):", errp.line, errp.column);  
                \_Result += buf;  
            }  
            sprintf(buf, xml\_error\_information\[errp.number\], errp.information.c\_str());  
            \_Result += buf;  
            return \_Result;  
        }

        element root() {  
            return &(res.root);  
        }

        element end() {

            return nullptr;  
        }

    private:  
        \_Nodetype \*recursive\_nodepath(\_Nodetype \*\_Parent, size\_t i) {  
            \_TagIndexTy \*pti = nodepath\_array\[i\];  
            auto \_next = i + 1;  
            if (\_next == nodepath\_array.size())  
            {  
                for (auto it = pti->begin(); it != pti->end(); ++it) {  
                    if (!i || (\*it)->parent == \_Parent)  
                        return \*it;  
                }  
            }  
            else  
            {  
                for (auto it = pti->begin(); it != pti->end(); ++it) {  
                    if (!i || (\*it)->parent == \_Parent) {  
                        \_Nodetype \*p = recursive\_nodepath(\*it, \_next);  
                        if (p) return p;  
                    }  
                }  
            }  
            return (\_Nodetype \*)nullptr;  
        }

    private:  
        \_ResourceTy res;  
        xerrorpos errp;  
        int \_SrcEncode;  
        std::vector<\_TagIndexTy\*> nodepath\_array;  
    };  
}

#if defined(_WIN32) || defined(_WIN64)
// Public names: aqx::xdoc<T> is the document type for scanner policy T, and the
// three policies are re-exported from the internal namespace.
// (The template parameter list had been stripped from this declaration.)
template<typename _Ty>
using xdoc = aqx_internal::xdocument_t<_Ty>;
using xts_utf8 = aqx_internal::xts_utf8;
using xts_utf16 = aqx_internal::xts_utf16;
using xts_asc = aqx_internal::xts_asc;
#else
// NOTE(review): the template argument of this alias was lost from the text.
// UTF-8 is assumed here because no transcoding is done outside Windows --
// confirm against the original source.
using xdoc = aqx_internal::xdocument_t<aqx_internal::xts_utf8>;
#endif

}

#pragma warning(pop)

//encode_adaptive.h - windows only
#pragma once
#include
#include "tcvt.h"

#ifndef _nf
#define _nf ((size_t)-1)
#endif
namespace aqx {

namespace encode\_adaptive {

    // Encoding codes. The numeric values are the same ones the functions in  
    // this namespace return and accept as literals.  
    static constexpr auto unknow{ static\_cast<int>(-1) };  
    static constexpr auto sys{ static\_cast<int>(0) };  
    static constexpr auto utf16{ static\_cast<int>(1) };  
    static constexpr auto utf8{ static\_cast<int>(2) };  
    /**
     * Guesses the encoding of a byte buffer.
     *
     * A UTF-8 BOM (EF BB BF) yields 2 and a UTF-16 BOM (FF FE) yields 1; in
     * both cases _Off receives the BOM length. Without a BOM, _Off is 0 and the
     * first 16 bytes (or fewer) are checked for zero bytes: none means _Def, a
     * single one in a buffer that was inspected completely also means _Def,
     * anything else is taken as UTF-16 (1).
     *
     * @param _Text buffer to inspect
     * @param _Size number of bytes in _Text
     * @param _Off  receives the number of leading bytes to skip
     * @param _Def  value returned when nothing points to UTF-16
     */
    static int profile_predict(unsigned char *_Text, size_t _Size, int &_Off, int _Def = 0) {
        const bool utf8_bom = _Size >= 3 &&
            _Text[0] == 0xEF && _Text[1] == 0xBB && _Text[2] == 0xBF;
        if (utf8_bom) {
            _Off = 3;
            return 2;
        }

        const bool utf16_bom = _Size >= 2 && _Text[0] == 0xFF && _Text[1] == 0xFE;
        if (utf16_bom) {
            _Off = 2;
            return 1;
        }

        _Off = 0;

        // Count zero bytes in the leading window.
        const size_t window = (_Size > 0x10) ? 0x10 : _Size;
        int zeros = 0;
        for (size_t k = 0; k != window; ++k) {
            if (_Text[k] == 0)
                ++zeros;
        }

        if (zeros == 0)
            return _Def;
        if (zeros == 1 && window == _Size)
            return _Def;
        return 1;
    }

    template<typename \_Ty>  
    static int profile\_adaptive(char \*\_Text, size\_t \_Size, \_Ty &\_Result, int \_Def = 0) {  
        int \_StartOff = 0;  
        int \_SrcCode = encode\_adaptive::profile\_predict((unsigned char\*)\_Text, \_Size, \_StartOff, \_Def);  
        size\_t \_TargetCode = 0;  
        if (sizeof(decltype(\*\_Result.c\_str())) == 2)  
            \_TargetCode = 1;  
        std::wstring \_utf16;  
        if (\_SrcCode == 2)  
            aqx::utf16\_from\_utf8(\_utf16, \_Text + \_StartOff);  
        else if (\_SrcCode == 1)  
            \_utf16 = (wchar\_t\*)(\_Text + \_StartOff);  
        else  
            aqx::utf16\_from\_asc(\_utf16, \_Text + \_StartOff);  
        auto \_proc0 = \[\](void \*\_Res, std::wstring &\_wstr) { asc\_from\_utf16(\*(std::string\*)\_Res, \_wstr); };  
        auto \_proc1 = \[\](void \*\_Res, std::wstring &\_wstr) { \*(std::wstring\*)(\_Res) = \_wstr; };  
        auto \_proc2 = \[\](void \*\_Res, std::wstring &\_wstr) { aqx::utf8\_from\_utf16(\*(std::string\*)(\_Res), \_wstr); };

        if (\_TargetCode == 0)  
            \_proc0(&\_Result, \_utf16);  
        else  
            \_proc1(&\_Result, \_utf16);  
        return \_SrcCode;  
    }

    template<typename \_Ty>  
    static size\_t specifiy(char \*\_Text, int \_Srcec, int \_Targetec, \_Ty &\_Result) {  
        if (sizeof(\_Ty::\_Mybase::\_Alty::value\_type) == 1 && \_Targetec == 1)  
            return \_nf;  
        if (sizeof(\_Ty::\_Mybase::\_Alty::value\_type) == 2 && \_Targetec != 1)  
            return \_nf;  
        if (\_Srcec == 2) {

            if (\_Targetec == 2)  
            {  
                \*(std::string\*)&\_Result = (\_Text);  
                return \_Result.length();  
            }  
            else if (\_Targetec == 1)  
                return utf16\_from\_utf8(\*(std::wstring\*)&\_Result, \_Text);  
            else  
                return asc\_from\_utf8(\*(std::string\*)&\_Result, \_Text);

        }  
        else if (\_Srcec == 1)  
        {  
            if (\_Targetec == 2)  
                return utf8\_from\_utf16(\*(std::string\*)&\_Result, (wchar\_t\*)\_Text);  
            else if (\_Targetec == 1) {  
                \*(std::wstring\*)&\_Result = (wchar\_t\*)(\_Text);  
                return \_Result.length();  
            }  
            else  
                return asc\_from\_utf16(\*(std::string\*)&\_Result, (wchar\_t\*)\_Text);  
        }  
        else  
        {  
            if (\_Targetec == 2)  
                return utf8\_from\_asc(\*(std::string\*)&\_Result, \_Text);  
            else if (\_Targetec == 1)  
                return utf16\_from\_asc(\*(std::wstring\*)&\_Result, \_Text);  
            else {  
                \*(std::string\*)&\_Result = (\_Text);  
                return \_Result.length();  
            }  
        }  
        return \_nf;  
    }

    /// Appends _Str to the std::string that _Res points to.
    static void unknow_append(void *_Res, std::string _Str) {
        static_cast<std::string*>(_Res)->append(_Str);
    }
    /// Appends _Str to the std::wstring that _Res points to.
    static void unknow_wappend(void *_Res, std::wstring _Str) {
        static_cast<std::wstring*>(_Res)->append(_Str);
    }

    /**
     * Checks whether BOM-less text looks like XML when read as wide characters.
     *
     * @param _Text zero-terminated wide text
     * @param _Size number of characters in _Text
     * @return  1 the text contains '<' and, if it starts a declaration, the
     *            declaration is at the very beginning and is terminated
     *         -1 fewer than 7 characters, or no '<' at all
     *         -3 a declaration ("<?") that is not at the start of the text
     *         -4 a declaration without the closing "?>"
     */
    static int xmlec_nbom_wchar(wchar_t *_Text, size_t _Size) {
        // Anything shorter than 7 characters cannot be an XML document.
        if (_Size < 7)
            return -1;

        const wchar_t *lt = wcschr(_Text, L'<');
        if (lt == nullptr)
            return -1;

        // Not a declaration: nothing more to verify.
        if (lt[1] != L'?')
            return 1;

        if (lt != _Text)
            return -3;
        return wcsstr(_Text + 2, L"?>") ? 1 : -4;
    }

    static int xmlec\_nbom\_char(char \*\_Text, size\_t \_Size) {  
        auto p = strchr(\_Text, '<');  
        if (!p) return -1;

        if (!p\[1\]) //找到第一个<,如果他它之后一个字符是0,则考虑它是不是utf16  
        {  
            if (p - \_Text == \_Size - 1) return -2;//如果它已经是字符串最后一个有效字符,直接报错。  
            if (\_Size % 2) return -2; //长度不是偶数,说明绝对不可能是utf16  
            return xmlec\_nbom\_wchar((wchar\_t\*)\_Text, (\_Size >> 1));  
        }

        if (p\[1\] == '?') {  
            if (p != \_Text) return -3;//xml声明没有位于xml文件头部  
            p = strstr(\_Text + 2, "?>");  
            if (!p) return -4;//没有找到xml声明结尾  
            auto s = (p - \_Text) + 2;  
            std::string str(\_Text, p - \_Text + 2);  
            std::transform(str.begin(), str.end(), str.begin(), toupper);  
            if (str.find("UTF-8") != \_nf) return 2;  
            if (str.find("GBK") != \_nf) return 0;  
            if (str.find("GB2312") != \_nf) return 0;  
        }

        return 2;  
    }

    static int xmlec\_predict(char \*\_Text, size\_t \_Size, int \*err\_number, size\_t \*\_Off = NULL, int \_Default = 2) {  
        \*err\_number = 0;  
        if (\_Size < 7) {  
            //小于7字节的xml文档是不成立的  
            \*err\_number = 29;  
            return -1;  
        }

        //先基于bom判断  
        if ((unsigned char)(\_Text\[0\]) == 0xEF && (unsigned char)(\_Text\[1\]) == 0xBB && (unsigned char)(\_Text\[2\]) == 0xBF) {  
            if (\_Off) \*\_Off = 3;  
            auto p = strchr(\_Text + 3, '<');  
            if (!p) {  
                \*err\_number = 29;  
                return -1;  
            }

            if (p\[1\] == '?')  
            {  
                if (p != \_Text + 3) {

                    \*err\_number = 31;  
                    return -1;  
                }  
                p = strstr(\_Text + 5, "?>");  
                if (!p) {  
                    \*err\_number = 32;  
                    return -1;  
                }  
            }

            return 2;  
        }  
        else if ((unsigned char)(\_Text\[0\]) == 0xFF && (unsigned char)(\_Text)\[1\] == 0xFE) {  
            if (\_Off) \*\_Off = 2;  
            auto p = wcschr((wchar\_t\*)\_Text + 1, L'<');  
            if (!p) {  
                \*err\_number = 29;  
                return -1;  
            }

            if (p\[1\] == L'?')  
            {  
                if (p != (wchar\_t\*)\_Text + 1) {  
                    \*err\_number = 31;  
                    return -1;  
                }  
                p = wcsstr((wchar\_t\*)\_Text + 3, L"?>");  
                if (!p) {  
                    \*err\_number = 32;  
                    return -1;  
                }

            }  
            return 1;  
        }

        if (\_Off) \*\_Off = 0;  
        int n = xmlec\_nbom\_char(\_Text, \_Size);  
        if (n < -1) {

            if (n == -2)  
                \*err\_number = 29;  
            else if (n == -3)  
                \*err\_number = 31;  
            else if (n == -4)  
                \*err\_number = 32;

            return -1;  
        }  
        else if (n >= 0) return n;  
        if (!(\_Size % 2))  
            n = xmlec\_nbom\_wchar((wchar\_t\*)\_Text, (\_Size >> 1));

        if (n < -1) {

            if (n == -2)  
                \*err\_number = 29;  
            else if (n == -3)  
                \*err\_number = 31;  
            else if (n == -4)  
                \*err\_number = 32;

            return -1;  
        }

        return \_Default;  
    }

};

}

//tcvt.h - windows only

#pragma once
#if defined(_WIN32) || defined(_WIN64)
#ifndef _WINDOWS_
#include
#endif
#endif

namespace aqx {

static size\_t \_mbs2wcs(int \_Cp, const std::string &\_Mbs, std::wstring &\_Wcs) {  
    int n = MultiByteToWideChar(\_Cp, 0, \_Mbs.c\_str(), (int)\_Mbs.length(), nullptr, 0);  
    \_Wcs.resize(n);  
    return MultiByteToWideChar(\_Cp, 0, \_Mbs.c\_str(), (int)\_Mbs.length(), (wchar\_t\*)\_Wcs.data(), (int)\_Wcs.capacity());  
}

static size\_t \_wcs2mbs(int \_Cp, const std::wstring &\_Wcs, std::string &\_Mbs) {  
    int n = WideCharToMultiByte(\_Cp, 0, \_Wcs.c\_str(), (int)\_Wcs.length(), nullptr, 0, NULL, FALSE);  
    \_Mbs.resize(n);  
    return WideCharToMultiByte(\_Cp, 0, \_Wcs.c\_str(), (int)\_Wcs.length(), (char\*)\_Mbs.data(), (int)\_Mbs.capacity(), NULL, FALSE);  
}

static size\_t utf8\_from\_asc(std::string &\_Result, const std::string &\_Asc) {  
    std::wstring \_Tmp;  
    \_mbs2wcs(CP\_ACP, \_Asc, \_Tmp);  
    return \_wcs2mbs(CP\_UTF8, \_Tmp, \_Result);  
}

static size\_t utf16\_from\_asc(std::wstring &\_Result, const std::string &\_Asc) {  
    return \_mbs2wcs(CP\_ACP, \_Asc, \_Result);  
}

static size\_t asc\_from\_utf8(std::string &\_Result, const std::string &\_U8s) {  
    std::wstring \_Tmp;  
    \_mbs2wcs(CP\_UTF8, \_U8s, \_Tmp);  
    return \_wcs2mbs(CP\_ACP, \_Tmp, \_Result);  
}

static size\_t utf16\_from\_utf8(std::wstring &\_Result, const std::string &\_U8s) {  
    return \_mbs2wcs(CP\_UTF8, \_U8s, \_Result);  
}

static size\_t utf8\_from\_utf16(std::string &\_Result, const std::wstring &\_Wcs) {  
    return \_wcs2mbs(CP\_UTF8, \_Wcs, \_Result);  
}

static size\_t asc\_from\_utf16(std::string &\_Result, const std::wstring &\_Wcs) {  
    return \_wcs2mbs(CP\_ACP, \_Wcs, \_Result);  
}

}

测试代码:

#include "pch.h"
#include
#include "xml.hpp"
#include

int main()
{
    // Use the user's locale so that wide strings print correctly with %ls.
    setlocale(LC_ALL, "");

    // Three scanner policies are available:
    // aqx::xts_utf16, aqx::xts_utf8 and aqx::xts_asc.
    aqx::xdoc<aqx::xts_utf16> doc;
    auto t = clock();
    int err = doc.load_file(L"G:\\vs2017\\test\\生成\\test.xml");

    // clock() counts ticks, so scale by CLOCKS_PER_SEC to get milliseconds,
    // and print with a format that matches the argument type.
    long elapsed_ms = (long)((clock() - t) * 1000 / CLOCKS_PER_SEC);
    printf("解析文档耗时:%ld ms\n", elapsed_ms);

    if (err) {
        printf("%s\n", doc.get_error_info().c_str());
        return 0;
    }
    auto e = doc.get_element(L"CATALOG2");
    printf("%ls\n", e.get_inner_xml().c_str());
    system("pause");
    return 0;
}

手机扫一扫

移动阅读更方便

阿里云服务器
腾讯云服务器
七牛云服务器

你可能感兴趣的文章