Regex library Nrex initial port

This commit is contained in:
Lee Zher Huei 2015-07-24 01:18:46 +01:00
parent f697ec2fe0
commit ef005d4f64
17 changed files with 1254 additions and 1206 deletions

View file

@ -31,7 +31,7 @@
//#include "math_funcs.h"
#include <stdio.h>
#include "os/os.h"
#include "drivers/trex/regex.h"
#include "drivers/nrex/regex.h"
#include "test_string.h"
@ -463,20 +463,16 @@ bool test_26() {
OS::get_singleton()->print("\n\nTest 26: RegEx\n");
RegEx regexp("(.*):(.*)");
List<String> captures;
bool match = regexp.match("name:password", &captures);
printf("\tmatch: %s\n", match?"true":"false");
bool res = regexp.match("name:password");
printf("\tmatch: %s\n", res?"true":"false");
printf("\t%i captures:\n", captures.size());
List<String>::Element *I = captures.front();
while (I) {
printf("%ls\n", I->get().c_str());
I = I->next();
};
return captures.size();
printf("\t%i captures:\n", regexp.get_capture_count());
for (int i = 0; i<regexp.get_capture_count(); i++)
{
printf("%ls\n", regexp.get_capture(i).c_str());
}
return res;
};
struct test_27_data {

View file

@ -29,7 +29,7 @@ if (env["openssl"]=="builtin"):
SConscript("rtaudio/SCsub");
SConscript("nedmalloc/SCsub");
SConscript("trex/SCsub");
SConscript("nrex/SCsub");
SConscript("chibi/SCsub");
if (env["vorbis"]=="yes" or env["speex"]=="yes" or env["theora"]=="yes"):
SConscript("ogg/SCsub");

64
drivers/nrex/README.md Normal file
View file

@ -0,0 +1,64 @@
# NREX: Node RegEx
Small node-based regular expression library. It only does text pattern
matching, not replacement. To use it, add the files `nrex.hpp`, `nrex.cpp`
and `nrex_config.h` to your project and follow the example:
nrex regex;
regex.compile("^(fo+)bar$");
nrex_result captures[regex.capture_size()];
if (regex.match("foobar", captures))
{
std::cout << captures[0].start << std::endl;
std::cout << captures[0].length << std::endl;
}
More details about its use are documented in `nrex.hpp`.
Currently supported features:
* Capturing `()` and non-capturing `(?:)` groups
* Any character `.`
* Shorthand character classes `\w\W\s\S\d\D`
* User-defined character classes such as `[A-Za-z]`
* Simple quantifiers `?`, `*` and `+`
* Range quantifiers `{0,1}`
* Lazy (non-greedy) quantifiers `*?`
* Beginning `^` and end `$` anchors
* Alternation `|`
* Backreferences `\1` to `\99`
To do list:
* Unicode `\uFFFF` code points
## License
Copyright (c) 2015, Zher Huei Lee
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View file

@ -2,8 +2,7 @@
Import('env')
sources = [
'trex.c',
'nrex.cpp',
'regex.cpp',
]
env.add_source_files(env.drivers_sources, sources)

902
drivers/nrex/nrex.cpp Normal file
View file

@ -0,0 +1,902 @@
// NREX: Node RegEx
//
// Copyright (c) 2015, Zher Huei Lee
// All rights reserved.
//
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented; you must not
// claim that you wrote the original software. If you use this software
// in a product, an acknowledgment in the product documentation would
// be appreciated but is not required.
//
// 2. Altered source versions must be plainly marked as such, and must not
// be misrepresented as being the original software.
//
// 3. This notice may not be removed or altered from any source
// distribution.
//
#include "nrex.hpp"
#ifdef NREX_UNICODE
#include <wctype.h>
#include <wchar.h>
#define NREX_ISALPHANUM iswalnum
#define NREX_STRLEN wcslen
#else
#include <ctype.h>
#include <string.h>
#define NREX_ISALPHANUM isalnum
#define NREX_STRLEN strlen
#endif
#ifdef NREX_THROW_ERROR
#define NREX_COMPILE_ERROR(M) throw nrex_compile_error(M)
#else
#define NREX_COMPILE_ERROR(M) reset(); return false
#endif
#ifndef NREX_NEW
#define NREX_NEW(X) new X
#define NREX_NEW_ARRAY(X, N) new X[N]
#define NREX_DELETE(X) delete X
#define NREX_DELETE_ARRAY(X) delete[] X
#endif
// Minimal growable array. In this file it backs the parser's group stack,
// a group's list of alternatives and the quantifier's backtracking stack.
//
// Storage is obtained through NREX_NEW_ARRAY / NREX_DELETE_ARRAY so that the
// host project can route allocations through its own allocator.
template<typename T>
class nrex_array
{
    private:
        T* _data;
        unsigned int _reserved;
        unsigned int _size;

        // The array owns _data, so an implicit member-wise copy would make
        // two objects release the same buffer. Copying is therefore disabled
        // (declared private and intentionally left undefined).
        nrex_array(const nrex_array&);
        nrex_array& operator=(const nrex_array&);
    public:
        // Creates an empty array with room for two elements.
        nrex_array()
            : _data(NREX_NEW_ARRAY(T, 2))
            , _reserved(2)
            , _size(0)
        {
        }

        ~nrex_array()
        {
            NREX_DELETE_ARRAY(_data);
        }

        // Number of elements currently stored.
        unsigned int size() const
        {
            return _size;
        }

        // Grows the storage so that it can hold at least `size` elements.
        // Requests that do not exceed the current capacity are ignored; this
        // also keeps a too-small request from overrunning the new buffer
        // while the existing elements are copied across.
        void reserve(unsigned int size)
        {
            if (size <= _reserved)
            {
                return;
            }
            T* old = _data;
            _data = NREX_NEW_ARRAY(T, size);
            _reserved = size;
            for (unsigned int i = 0; i < _size; ++i)
            {
                _data[i] = old[i];
            }
            NREX_DELETE_ARRAY(old);
        }

        // Appends a copy of `item`, doubling the capacity when full.
        void push(T item)
        {
            if (_size == _reserved)
            {
                reserve(_reserved * 2);
            }
            _data[_size] = item;
            _size++;
        }

        // Last element. Must not be called on an empty array.
        T& top()
        {
            return _data[_size - 1];
        }

        // Unchecked read access to element `i`.
        const T& operator[] (unsigned int i) const
        {
            return _data[i];
        }

        // Drops the last element; does nothing on an empty array.
        void pop()
        {
            if (_size > 0)
            {
                --_size;
            }
        }
};
// Translates the character following a backslash into the literal character
// it stands for.
//
// Returns 0 when `repr` is not a recognised literal escape; the caller then
// tries shorthand classes and back-references before reporting an error.
static nrex_char nrex_unescape(nrex_char repr)
{
    switch (repr)
    {
        // Escaped metacharacters stand for themselves. The bracket, brace
        // and pipe entries are required because those characters always
        // carry syntactic meaning when they appear unescaped in a pattern.
        case '^': return '^';
        case '$': return '$';
        case '(': return '(';
        case ')': return ')';
        case '[': return '[';
        case ']': return ']';
        case '{': return '{';
        case '}': return '}';
        case '|': return '|';
        case '\\': return '\\';
        case '.': return '.';
        case '+': return '+';
        case '*': return '*';
        case '?': return '?';
        case '-': return '-';
        // Control characters.
        case 'a': return '\a';
        case 'e': return '\x1B'; // ESC; the '\e' literal is a compiler extension
        case 'f': return '\f';
        case 'n': return '\n';
        case 'r': return '\r';
        case 't': return '\t';
        case 'v': return '\v';
    }
    return 0;
}
// Per-call matching state that is threaded through every node test.
struct nrex_search
{
    public:
        const nrex_char* str;  // text being searched
        nrex_result* captures; // caller-provided capture slots
        int end;               // index one past the last searchable character
        bool complete;         // set once the remainder of the pattern has
                               // already been verified up to the root

        // Character at index `pos`; performs no bounds checking.
        nrex_char at(int pos)
        {
            return str[pos];
        }

        // `end` starts at 0 and is filled in by nrex::match(). `complete`
        // is initialised as well so that it never holds an indeterminate
        // value.
        nrex_search(const nrex_char* str, nrex_result* captures)
            : str(str)
            , captures(captures)
            , end(0)
            , complete(false)
        {
        }
};
struct nrex_node
{
nrex_node* next;
nrex_node* previous;
nrex_node* parent;
bool quantifiable;
nrex_node(bool quantify = false)
: next(NULL)
, previous(NULL)
, parent(NULL)
, quantifiable(quantify)
{
}
virtual ~nrex_node()
{
if (next)
{
delete next;
}
}
virtual int test(nrex_search* s, int pos) const
{
return next ? next->test(s, pos) : -1;
}
virtual int test_parent(nrex_search* s, int pos) const
{
if (next)
{
pos = next->test(s, pos);
}
if (parent && pos >= 0)
{
pos = parent->test_parent(s, pos);
}
if (pos >= 0)
{
s->complete = true;
}
return pos;
}
};
struct nrex_node_group : public nrex_node
{
int capturing;
bool negate;
nrex_array<nrex_node*> childset;
nrex_node* back;
nrex_node_group(int capturing)
: nrex_node(true)
, capturing(capturing)
, negate(false)
, back(NULL)
{
}
virtual ~nrex_node_group()
{
for (unsigned int i = 0; i < childset.size(); ++i)
{
delete childset[i];
}
}
int test(nrex_search* s, int pos) const
{
if (capturing >= 0)
{
s->captures[capturing].start = pos;
}
for (unsigned int i = 0; i < childset.size(); ++i)
{
s->complete = false;
int res = childset[i]->test(s, pos);
if (s->complete)
{
return res;
}
if ((res >= 0) != negate)
{
if (capturing >= 0)
{
s->captures[capturing].length = res - pos;
}
return next ? next->test(s, res) : res;
}
}
return -1;
}
virtual int test_parent(nrex_search* s, int pos) const
{
if (capturing >= 0)
{
s->captures[capturing].length = pos - s->captures[capturing].start;
}
return nrex_node::test_parent(s, pos);
}
void add_childset()
{
back = NULL;
}
void add_child(nrex_node* node)
{
node->parent = this;
node->previous = back;
if (back)
{
back->next = node;
}
else
{
childset.push(node);
}
back = node;
}
nrex_node* swap_back(nrex_node* node)
{
if (!back)
{
add_child(node);
return NULL;
}
nrex_node* old = back;
if (!old->previous)
{
childset.pop();
}
back = old->previous;
add_child(node);
return old;
}
};
// Matches one specific literal character.
struct nrex_node_char : public nrex_node
{
        nrex_char ch;

        nrex_node_char(nrex_char c)
            : nrex_node(true)
            , ch(c)
        {
        }

        // Succeeds when the character at `pos` equals `ch`, then hands the
        // following position to the next sibling, if there is one.
        int test(nrex_search* s, int pos) const
        {
            const bool matches = (pos != s->end) && (s->at(pos) == ch);
            if (!matches)
            {
                return -1;
            }
            const int after = pos + 1;
            if (next)
            {
                return next->test(s, after);
            }
            return after;
        }
};
struct nrex_node_range : public nrex_node
{
nrex_char start;
nrex_char end;
nrex_node_range(nrex_char s, nrex_char e)
: nrex_node(true)
, start(s)
, end(e)
{
}
int test(nrex_search* s, int pos) const
{
if (s->end == pos)
{
return -1;
}
nrex_char c = s->at(pos);
if (c < start || end < c)
{
return -1;
}
return next ? next->test(s, pos + 1) : pos + 1;
}
};
// True for the characters the `\s` shorthand accepts: space, tab, carriage
// return, line feed and form feed.
static bool nrex_is_whitespace(nrex_char repr)
{
    return repr == ' '
        || repr == '\t'
        || repr == '\r'
        || repr == '\n'
        || repr == '\f';
}
// True when `repr` names a shorthand character class (the letter that
// follows the backslash in \w \W \d \D \s \S).
static bool nrex_is_shorthand(nrex_char repr)
{
    return repr == 'W' || repr == 'w'
        || repr == 'D' || repr == 'd'
        || repr == 'S' || repr == 's';
}
struct nrex_node_shorthand : public nrex_node
{
nrex_char repr;
nrex_node_shorthand(nrex_char c)
: nrex_node(true)
, repr(c)
{
}
int test(nrex_search* s, int pos) const
{
if (s->end == pos)
{
return -1;
}
bool found = false;
bool invert = false;
nrex_char c = s->at(pos);
switch (repr)
{
case '.':
found = true;
break;
case 'W':
invert = true;
case 'w':
if (c == '_' || NREX_ISALPHANUM(c))
{
found = true;
}
break;
case 'D':
invert = true;
case 'd':
if ('0' <= c && c <= '9')
{
found = true;
}
break;
case 'S':
invert = true;
case 's':
if (nrex_is_whitespace(c))
{
found = true;
}
break;
}
if (found == invert)
{
return -1;
}
return next ? next->test(s, pos + 1) : pos + 1;
}
};
// True when `repr` starts a quantifier: `?`, `*`, `+` or the opening brace
// of a `{min,max}` range.
static bool nrex_is_quantifier(nrex_char repr)
{
    return repr == '?' || repr == '*' || repr == '+' || repr == '{';
}
struct nrex_node_quantifier : public nrex_node
{
int min;
int max;
bool greedy;
nrex_node* child;
nrex_node_quantifier()
: nrex_node()
, min(0)
, max(0)
, greedy(true)
, child(NULL)
{
}
virtual ~nrex_node_quantifier()
{
if (child)
{
delete child;
}
}
int test(nrex_search* s, int pos) const
{
nrex_array<int> backtrack;
backtrack.push(pos);
s->complete = false;
while (backtrack.top() <= s->end)
{
if (max >= 1 && backtrack.size() > (unsigned int)max)
{
break;
}
if (!greedy && (unsigned int)min < backtrack.size())
{
int res = backtrack.top();
if (next)
{
res = next->test(s, res);
}
if (s->complete)
{
return res;
}
if (res >= 0 && parent->test_parent(s, res) >= 0)
{
return res;
}
}
s->complete = false;
int res = child->test(s, backtrack.top());
if (s->complete)
{
return res;
}
if (res < 0 || res == backtrack.top())
{
break;
}
backtrack.push(res);
}
while (greedy && (unsigned int) min < backtrack.size())
{
s->complete = false;
int res = backtrack.top();
if (s->complete)
{
return res;
}
if (next)
{
res = next->test(s, res);
}
if (res >= 0 && parent->test_parent(s, res) >= 0)
{
return res;
}
backtrack.pop();
}
return -1;
}
};
// Zero-width assertion for the start (`^`) or the end (`$`) of the input.
struct nrex_node_anchor : public nrex_node
{
        bool end; // true for `$`, false for `^`

        nrex_node_anchor(bool end)
            : nrex_node()
            , end(end)
        {
        }

        // Succeeds without consuming input when `pos` is the anchored
        // position: index 0 for `^`, the search end for `$`.
        int test(nrex_search* s, int pos) const
        {
            const int anchored_at = end ? s->end : 0;
            if (pos != anchored_at)
            {
                return -1;
            }
            if (next)
            {
                return next->test(s, pos);
            }
            return pos;
        }
};
struct nrex_node_backreference : public nrex_node
{
int ref;
nrex_node_backreference(int ref)
: nrex_node(true)
, ref(ref)
{
}
int test(nrex_search* s, int pos) const
{
nrex_result& r = s->captures[ref];
for (int i = 0; i < r.length; ++i)
{
if (pos + i >= s->end)
{
return -1;
}
if (s->at(r.start + i) != s->at(pos + i))
{
return -1;
}
}
return next ? next->test(s, pos + r.length) : pos + r.length;
}
};
// Starts out with no compiled pattern and no capture groups.
nrex::nrex()
    : _capturing(0), _root(NULL)
{
}
// Releases the compiled pattern, if any. The node tree is built with
// NREX_NEW, so it is released through the matching NREX_DELETE.
nrex::~nrex()
{
    if (_root)
    {
        NREX_DELETE(_root);
    }
}
// Reports whether a compiled pattern is currently stored.
bool nrex::valid() const
{
    if (_root == NULL)
    {
        return false;
    }
    return true;
}
void nrex::reset()
{
_capturing = 0;
if (_root)
{
delete _root;
}
_root = NULL;
}
// Number of result slots match() needs: one for the whole match plus one
// per capturing group.
int nrex::capture_size() const
{
    const int whole_match_slot = 1;
    return _capturing + whole_match_slot;
}
// Parses `pattern` and builds the node tree rooted at _root, replacing any
// previously compiled pattern.
//
// The parser walks the pattern once, left to right. `stack` holds the chain
// of currently open groups; new nodes are always appended to stack.top().
// On a syntax error NREX_COMPILE_ERROR either throws (NREX_THROW_ERROR) or
// resets the object and returns false.
//
// NOTE(review): the stack is not inspected after the loop, so a pattern
// with an unclosed '(' appears to be accepted as-is - confirm whether that
// is intended.
bool nrex::compile(const nrex_char* pattern)
{
reset();
// The root group uses capture slot 0 (_capturing is 0 after reset()).
nrex_node_group* root = NREX_NEW(nrex_node_group(_capturing));
nrex_array<nrex_node_group*> stack;
stack.push(root);
_root = root;
for (const nrex_char* c = pattern; c[0] != '\0'; ++c)
{
if (c[0] == '(')
{
if (c[1] == '?')
{
if (c[2] == ':')
{
// "(?:" opens a non-capturing group; skip the two extra characters.
c = &c[2];
nrex_node_group* group = NREX_NEW(nrex_node_group(-1));
stack.top()->add_child(group);
stack.push(group);
}
else
{
NREX_COMPILE_ERROR("unrecognised qualifier for parenthesis");
}
}
else if (_capturing < 99)
{
// Plain "(" opens a capturing group and claims the next slot.
nrex_node_group* group = NREX_NEW(nrex_node_group(++_capturing));
stack.top()->add_child(group);
stack.push(group);
}
else
{
// All 99 capture slots are in use: further groups do not capture.
nrex_node_group* group = NREX_NEW(nrex_node_group(-1));
stack.top()->add_child(group);
stack.push(group);
}
}
else if (c[0] == ')')
{
// The root group at the bottom of the stack is never popped.
if (stack.size() > 1)
{
stack.pop();
}
else
{
NREX_COMPILE_ERROR("unexpected ')'");
}
}
else if (c[0] == '[')
{
// A character class is a non-capturing group in which every member
// becomes its own alternative. It is not pushed on the stack because it
// is parsed to completion right here.
nrex_node_group* group = NREX_NEW(nrex_node_group(-1));
stack.top()->add_child(group);
if (c[1] == '^')
{
group->negate = true;
++c;
}
while (true)
{
// Start a fresh alternative for the next class member.
group->add_childset();
++c;
if (c[0] == '\0')
{
NREX_COMPILE_ERROR("unclosed character class '[]'");
}
if (c[0] == ']')
{
break;
}
else if (c[0] == '\\')
{
// Escapes inside a class: literal escapes first, then shorthands.
nrex_char unescaped = nrex_unescape(c[1]);
if (unescaped)
{
group->add_child(NREX_NEW(nrex_node_char(unescaped)));
++c;
}
else if (nrex_is_shorthand(c[1]))
{
group->add_child(NREX_NEW(nrex_node_shorthand(c[1])));
++c;
}
else
{
NREX_COMPILE_ERROR("escape token not recognised");
}
}
else
{
if (c[1] == '-' && c[2] != '\0')
{
// "x-y" forms a range only when both ends are upper-case letters,
// both are lower-case letters or both are digits; otherwise the
// characters are taken literally one at a time.
bool range = false;
if ('A' <= c[0] && c[0] <= 'Z' && 'A' <= c[2] && c[2] <= 'Z')
{
range = true;
}
if ('a' <= c[0] && c[0] <= 'z' && 'a' <= c[2] && c[2] <= 'z')
{
range = true;
}
if ('0' <= c[0] && c[0] <= '9' && '0' <= c[2] && c[2] <= '9')
{
range = true;
}
if (range)
{
group->add_child(NREX_NEW(nrex_node_range(c[0], c[2])));
c = &c[2];
continue;
}
}
group->add_child(NREX_NEW(nrex_node_char(c[0])));
}
}
}
else if (nrex_is_quantifier(c[0]))
{
// The quantifier takes the place of the node added last and adopts that
// node as its child. If there is no such node, swap_back() has already
// appended the quantifier, so it is released together with the tree when
// the error path resets the object.
nrex_node_quantifier* quant = NREX_NEW(nrex_node_quantifier);
quant->child = stack.top()->swap_back(quant);
if (quant->child == NULL || !quant->child->quantifiable)
{
NREX_COMPILE_ERROR("element not quantifiable");
}
// Detach the child from its former siblings.
quant->child->previous = NULL;
quant->child->next = NULL;
quant->child->parent = quant;
if (c[0] == '?')
{
quant->min = 0;
quant->max = 1;
}
else if (c[0] == '+')
{
quant->min = 1;
quant->max = -1;
}
else if (c[0] == '*')
{
quant->min = 0;
quant->max = -1;
}
else if (c[0] == '{')
{
// "{min}", "{min,}" or "{min,max}". Digits before the comma accumulate
// into min, digits after it into max; max stays -1 when no digit follows
// the comma.
bool max_set = false;
quant->min = 0;
quant->max = -1;
while (true)
{
++c;
if (c[0] == '\0')
{
NREX_COMPILE_ERROR("unclosed range quantifier '{}'");
}
else if (c[0] == '}')
{
break;
}
else if (c[0] == ',')
{
max_set = true;
continue;
}
else if (c[0] < '0' || '9' < c[0])
{
NREX_COMPILE_ERROR("expected numeric digits, ',' or '}'");
}
if (max_set)
{
if (quant->max < 0)
{
quant->max = int(c[0] - '0');
}
else
{
quant->max = quant->max * 10 + int(c[0] - '0');
}
}
else
{
quant->min = quant->min * 10 + int(c[0] - '0');
}
}
if (!max_set)
{
// No comma: "{n}" means exactly n repetitions.
quant->max = quant->min;
}
}
if (c[1] == '?')
{
// A trailing '?' makes the quantifier lazy.
quant->greedy = false;
++c;
}
}
else if (c[0] == '|')
{
// Alternation: subsequent nodes start a new alternative of the open group.
stack.top()->add_childset();
}
else if (c[0] == '^' || c[0] == '$')
{
stack.top()->add_child(NREX_NEW(nrex_node_anchor((c[0] == '$'))));
}
else if (c[0] == '.')
{
stack.top()->add_child(NREX_NEW(nrex_node_shorthand('.')));
}
else if (c[0] == '\\')
{
// Escapes outside a class: literal escapes, shorthands, then one- or
// two-digit back-references.
nrex_char unescaped = nrex_unescape(c[1]);
if (unescaped)
{
stack.top()->add_child(NREX_NEW(nrex_node_char(unescaped)));
++c;
}
else if (nrex_is_shorthand(c[1]))
{
stack.top()->add_child(NREX_NEW(nrex_node_shorthand(c[1])));
++c;
}
else if ('1' <= c[1] && c[1] <= '9')
{
int ref = 0;
if ('0' <= c[2] && c[2] <= '9')
{
// A second digit is always consumed as part of the reference number.
ref = int(c[1] - '0') * 10 + int(c[2] - '0');
c = &c[2];
}
else
{
ref = int(c[1] - '0');
++c;
}
// Only groups opened so far may be referenced.
if (ref > _capturing)
{
NREX_COMPILE_ERROR("backreference to non-existent capture");
}
stack.top()->add_child(NREX_NEW(nrex_node_backreference(ref)));
}
else
{
NREX_COMPILE_ERROR("escape token not recognised");
}
}
else
{
// Anything else is a literal character.
stack.top()->add_child(NREX_NEW(nrex_node_char(c[0])));
}
}
return true;
}
// Searches `str` for the first position, starting at `offset`, at which the
// compiled pattern matches, and fills `captures` with the matched ranges.
//
// `end` is the index one past the last character to consider; when it is
// smaller than `offset` the string is searched up to its null terminator.
// Returns true if a match was found, false otherwise (including when no
// pattern has been compiled).
bool nrex::match(const nrex_char* str, nrex_result* captures, int offset, int end) const
{
    if (!_root)
    {
        // Nothing compiled: there is no tree to test against.
        return false;
    }
    nrex_search s(str, captures);
    if (end >= offset)
    {
        s.end = end;
    }
    else
    {
        s.end = NREX_STRLEN(str);
    }
    // The end position itself is a valid starting point: patterns that can
    // match the empty string (for example "^$" or "a*") must be able to
    // match there, in particular when the searched range is empty.
    for (int i = offset; i <= s.end; ++i)
    {
        // Clear results left behind by a failed attempt at an earlier
        // starting position.
        for (int c = 0; c <= _capturing; ++c)
        {
            captures[c].start = 0;
            captures[c].length = 0;
        }
        if (_root->test(&s, i) >= 0)
        {
            return true;
        }
    }
    return false;
}

144
drivers/nrex/nrex.hpp Normal file
View file

@ -0,0 +1,144 @@
// NREX: Node RegEx
//
// Copyright (c) 2015, Zher Huei Lee
// All rights reserved.
//
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented; you must not
// claim that you wrote the original software. If you use this software
// in a product, an acknowledgment in the product documentation would
// be appreciated but is not required.
//
// 2. Altered source versions must be plainly marked as such, and must not
// be misrepresented as being the original software.
//
// 3. This notice may not be removed or altered from any source
// distribution.
//
#ifndef NREX_HPP
#define NREX_HPP
#include "nrex_config.h"
#ifdef NREX_UNICODE
typedef wchar_t nrex_char;
#else
typedef char nrex_char;
#endif
/*!
 * \brief Range of text covered by one capture
 *
 * Both values are expressed relative to the beginning of the searched
 * string.
 *
 * \see nrex::match()
 */
struct nrex_result
{
    int start;  /*!< Index of the first character of the range */
    int length; /*!< Number of characters in the range */
};
class nrex_node;
/*!
 * \brief Holds the compiled regex pattern
 *
 * The object owns the compiled node tree and releases it on destruction,
 * so instances cannot be copied.
 */
class nrex
{
    private:
        int _capturing;
        nrex_node* _root;

        // Copying would leave two objects releasing the same node tree, so
        // both operations are declared private and intentionally left
        // undefined.
        nrex(const nrex&);
        nrex& operator=(const nrex&);
    public:
        nrex();
        ~nrex();

        /*!
         * \brief Removes the compiled regex and frees up the memory
         */
        void reset();

        /*!
         * \brief Checks if there is a compiled regex being stored
         * \return True if present, False if not present
         */
        bool valid() const;

        /*!
         * \brief Provides number of captures the compiled regex uses
         *
         * This is used to provide the array size of the captures needed for
         * nrex::match() to work. The size is actually the number of capture
         * groups + one for the matching of the entire pattern. The result is
         * always capped at 100.
         *
         * \return The number of captures
         */
        int capture_size() const;

        /*!
         * \brief Compiles the provided regex pattern
         *
         * This automatically removes the existing compiled regex if already
         * present.
         *
         * If NREX_THROW_ERROR is defined, a nrex_compile_error is thrown
         * when a problem is encountered while parsing the pattern.
         *
         * \param pattern  The regex pattern
         * \return True if the pattern was successfully compiled
         */
        bool compile(const nrex_char* pattern);

        /*!
         * \brief Uses the pattern to search through the provided string
         * \param str       The text to search through. It only needs to be
         *                  null terminated if the end point is not provided.
         *                  This also determines the starting anchor.
         * \param captures  The array of results to store the capture results.
         *                  The size of that array needs to be the same as the
         *                  size given in nrex::capture_size(). As it matches
         *                  the function fills the array with the results. 0 is
         *                  the result for the entire pattern, 1 and above
         *                  corresponds to the regex capture group if present.
         * \param offset    The starting point of the search. This does not move
         *                  the starting anchor. Defaults to 0.
         * \param end       The end point of the search. This also determines
         *                  the ending anchor. If a number less than the offset
         *                  is provided, the search would be done until null
         *                  termination. Defaults to -1.
         * \return True if a match was found. False otherwise.
         */
        bool match(const nrex_char* str, nrex_result* captures, int offset = 0, int end = -1) const;
};
#ifdef NREX_THROW_ERROR
#include <stdexcept>
/*!
 * \brief Error thrown by nrex::compile() when NREX_THROW_ERROR is defined
 *
 * The base class is inherited publicly. With the `class` key the default
 * is private inheritance, which would prevent handlers for
 * std::runtime_error or std::exception from catching this type and would
 * hide what().
 */
class nrex_compile_error : public std::runtime_error
{
    public:
        /*!
         * \param message  Description of the problem found in the pattern
         */
        nrex_compile_error(const char* message)
            : std::runtime_error(message)
        {
        }

        ~nrex_compile_error() throw()
        {
        }
};
#endif
#endif // NREX_HPP

View file

@ -0,0 +1,12 @@
// Godot-specific configuration
// To use this, replace nrex_config.h
#include "core/os/memory.h"
#define NREX_UNICODE
//#define NREX_THROW_ERROR
#define NREX_NEW(X) memnew(X)
#define NREX_NEW_ARRAY(X, N) memnew_arr(X, N)
#define NREX_DELETE(X) memdelete(X)
#define NREX_DELETE_ARRAY(X) memdelete_arr(X)

112
drivers/nrex/regex.cpp Normal file
View file

@ -0,0 +1,112 @@
/*************************************************/
/* regex.cpp */
/*************************************************/
/* This file is part of: */
/* GODOT ENGINE */
/*************************************************/
/* Source code within this file is: */
/* (c) 2007-2010 Juan Linietsky, Ariel Manzur */
/* All Rights Reserved. */
/*************************************************/
#include "regex.h"
#include "nrex.hpp"
#include "core/os/memory.h"
// Registers the methods of RegEx with ObjectTypeDB: compile, match (whose
// "start" and "end" arguments default to 0 and -1), get_capture and
// get_capture_list.
void RegEx::_bind_methods() {
ObjectTypeDB::bind_method(_MD("compile","pattern"),&RegEx::compile);
ObjectTypeDB::bind_method(_MD("match","text","start","end"),&RegEx::match, DEFVAL(0), DEFVAL(-1));
ObjectTypeDB::bind_method(_MD("get_capture","capture"),&RegEx::get_capture);
// Exposed under the name "get_capture_list", implemented by a protected helper.
ObjectTypeDB::bind_method(_MD("get_capture_list"),&RegEx::_bind_get_capture_list);
};
// Collects every capture of the last match, the whole match first, into a
// string array.
StringArray RegEx::_bind_get_capture_list() const {

	StringArray list;
	for (int idx = 0, total = get_capture_count(); idx < total; ++idx) {

		list.push_back(get_capture(idx));
	}
	return list;
}
// Drops the compiled pattern together with the text and capture ranges
// remembered from the last match.
void RegEx::clear() {

	exp.reset();
	captures.clear();
	text.clear();
}
// True while a successfully compiled pattern is held.
bool RegEx::is_valid() const {

	const bool has_pattern = exp.valid();
	return has_pattern;
}
// Number of capture slots reported by the compiled expression (the whole
// match counts as one).
int RegEx::get_capture_count() const {

	const int slots = exp.capture_size();
	return slots;
}
// Returns the text of capture `capture` from the last successful match;
// index 0 is the whole match. Fails with an empty string when the index is
// out of range.
String RegEx::get_capture(int capture) const {

	ERR_FAIL_COND_V( get_capture_count() <= capture, String() );
	// A negative index would read in front of the capture array.
	ERR_FAIL_COND_V( capture < 0, String() );
	// After a failed compile() the capture array is empty even though
	// get_capture_count() still reports one slot.
	ERR_FAIL_COND_V( captures.size() <= capture, String() );

	return text.substr(captures[capture].start, captures[capture].length);
}
// Replaces the current pattern with `p_pattern`. Returns OK on success and
// FAILED when the pattern could not be compiled.
Error RegEx::compile(const String& p_pattern) {

	// Start from a clean slate so that no stale match data survives.
	clear();

	exp.compile(p_pattern.c_str());
	ERR_FAIL_COND_V( !exp.valid(), FAILED );

	// One result slot per capture reported by the compiled expression.
	const int slots = exp.capture_size();
	captures.resize(slots);

	return OK;
}
// Searches `p_text` for the compiled pattern between `p_start` and `p_end`
// (a `p_end` smaller than `p_start` means "until the end of the text").
// On success the text is remembered so that get_capture() can slice it;
// on failure the remembered text is cleared. Returns whether a match was
// found.
bool RegEx::match(const String& p_text, int p_start, int p_end) const {

	ERR_FAIL_COND_V( !exp.valid(), false );
	// A negative start would make the search index in front of the string.
	ERR_FAIL_COND_V( p_start < 0, false );
	ERR_FAIL_COND_V( p_text.length() < p_start, false );
	ERR_FAIL_COND_V( p_text.length() < p_end, false );

	bool res = exp.match(p_text.c_str(), &captures[0], p_start, p_end);

	if (res) {
		text = p_text;
		return true;
	}
	text.clear();
	return false;
}
// Convenience constructor: compiles `p_pattern` right away. Use is_valid()
// to find out whether compilation succeeded.
RegEx::RegEx(const String& p_pattern) {

	compile(p_pattern);
}
// Creates an object without a pattern; compile() must be called before
// match().
RegEx::RegEx() {

}
// Releases the compiled pattern and any remembered match data.
RegEx::~RegEx() {

	clear();
}

View file

@ -13,34 +13,31 @@
#define REGEX_H
#include "ustring.h"
#include "list.h"
#include "vector.h"
#include "core/reference.h"
struct TRex;
#include "nrex.hpp"
class RegEx : public Reference {
OBJ_TYPE(RegEx, Reference);
mutable String text;
TRex *exp;
mutable Vector<nrex_result> captures;
nrex exp;
protected:
static void _bind_methods();
StringArray _bind_get_capture_list() const;
int _bind_find(const String& p_text, int p_start = 0, int p_end = -1) const;
StringArray _bind_get_captures() const;
public:
void clear();
Error compile(const String& p_pattern);
bool is_valid() const;
bool match(const String& p_text, List<String>* p_captures = NULL, int p_start = 0, int p_end = -1) const;
bool find(const String& p_text, int& p_rstart, int &p_rend, List<String>* p_captures = NULL, int p_start = 0, int p_end = -1) const;
int get_capture_count() const;
Error get_capture_limits(int p_capture, int& p_start, int& p_len) const;
String get_capture(int p_idx) const;
String get_capture(int capture) const;
Error compile(const String& p_pattern);
bool match(const String& p_text, int p_start = 0, int p_end = -1) const;
RegEx();
RegEx(const String& p_pattern);

View file

@ -48,7 +48,7 @@
#endif
#include "drivers/trex/regex.h"
#include "drivers/nrex/regex.h"
#ifdef MUSEPACK_ENABLED
#include "mpc/audio_stream_mpc.h"

View file

@ -1,75 +0,0 @@
#ifndef _TREXPP_H_
#define _TREXPP_H_
/***************************************************************
T-Rex a tiny regular expression library
Copyright (C) 2003-2004 Alberto Demichelis
This software is provided 'as-is', without any express
or implied warranty. In no event will the authors be held
liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for
any purpose, including commercial applications, and to alter
it and redistribute it freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented;
you must not claim that you wrote the original software.
If you use this software in a product, an acknowledgment
in the product documentation would be appreciated but
is not required.
2. Altered source versions must be plainly marked as such,
and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any
source distribution.
****************************************************************/
extern "C" {
#include "trex.h"
}
// Thrown by TRexpp::Compile(); `desc` is the error string handed back by
// trex_compile().
struct TRexParseException {
	const TRexChar *desc;

	TRexParseException(const TRexChar *c)
		: desc(c)
	{
	}
};
class TRexpp {
public:
TRexpp() { _exp = (TRex *)0; }
~TRexpp() { CleanUp(); }
// compiles a regular expression
void Compile(const TRexChar *pattern) {
const TRexChar *error;
CleanUp();
if(!(_exp = trex_compile(pattern,&error)))
throw TRexParseException(error);
}
// return true if the given text match the expression
bool Match(const TRexChar* text) {
return _exp?(trex_match(_exp,text) != 0):false;
}
// Searches for the first match of the expression in a zero terminated string
bool Search(const TRexChar* text, const TRexChar** out_begin, const TRexChar** out_end) {
return _exp?(trex_search(_exp,text,out_begin,out_end) != 0):false;
}
// Searches for the first match of the expression in a string sarting at text_begin and ending at text_end
bool SearchRange(const TRexChar* text_begin,const TRexChar* text_end,const TRexChar** out_begin, const TRexChar** out_end) {
return _exp?(trex_searchrange(_exp,text_begin,text_end,out_begin,out_end) != 0):false;
}
bool GetSubExp(int n, const TRexChar** out_begin, int *out_len)
{
TRexMatch match;
TRexBool res = _exp?(trex_getsubexp(_exp,n,&match)):TRex_False;
if(res) {
*out_begin = match.begin;
*out_len = match.len;
return true;
}
return false;
}
int GetSubExpCount() { return _exp?trex_getsubexpcount(_exp):0; }
private:
void CleanUp() { if(_exp) trex_free(_exp); _exp = (TRex *)0; }
TRex *_exp;
};
#endif //_TREXPP_H_

View file

@ -1,15 +0,0 @@
===version 1.3
-fixed a bug for GCC users(thx Brendan)
===version 1.2
-added word boundary match \b and \B
-added vertical tab escape \v
-\w now also matches '_' (underscore)
-fixed greediness for * and +
===version 1.1 , April 1, 2004
-fixed some minor bug
-added predefined character classes(\w,\W,\s,\S etc...)
===version 1.0 , February 23, 2004
-first public realase

View file

@ -1,171 +0,0 @@
T-REX 1.3 http://tiny-rex.sourceforge.net
----------------------------------------------------------------------
T-Rex a tiny regular expression library
Copyright (C) 2003-2006 Alberto Demichelis
This software is provided 'as-is', without any express
or implied warranty. In no event will the authors be held
liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for
any purpose, including commercial applications, and to alter
it and redistribute it freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented;
you must not claim that you wrote the original software.
If you use this software in a product, an acknowledgment
in the product documentation would be appreciated but
is not required.
2. Altered source versions must be plainly marked as such,
and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any
source distribution.
----------------------------------------------------------------------
TRex implements the following expressions
\ Quote the next metacharacter
^ Match the beginning of the string
. Match any character
$ Match the end of the string
| Alternation
() Grouping (creates a capture)
[] Character class
==GREEDY CLOSURES==
* Match 0 or more times
+ Match 1 or more times
? Match 1 or 0 times
{n} Match exactly n times
{n,} Match at least n times
{n,m} Match at least n but not more than m times
==ESCAPE CHARACTERS==
\t tab (HT, TAB)
\n newline (LF, NL)
\r return (CR)
\f form feed (FF)
==PREDEFINED CLASSES==
\l lowercase next char
\u uppercase next char
\a letters
\A non letters
\w alphanimeric [0-9a-zA-Z]
\W non alphanimeric
\s space
\S non space
\d digits
\D non nondigits
\x exadecimal digits
\X non exadecimal digits
\c control charactrs
\C non control charactrs
\p punctation
\P non punctation
\b word boundary
\B non word boundary
----------------------------------------------------------------------
API DOC
----------------------------------------------------------------------
TRex *trex_compile(const TRexChar *pattern,const TRexChar **error);
compiles an expression and returns a pointer to the compiled version.
in case of failure returns NULL.The returned object has to be deleted
through the function trex_free().
pattern
a pointer to a zero terminated string containing the pattern that
has to be compiled.
error
apointer to a string pointer that will be set with an error string
in case of failure.
----------------------------------------------------------------------
void trex_free(TRex *exp)
deletes a expression structure created with trex_compile()
exp
the expression structure that has to be deleted
----------------------------------------------------------------------
TRexBool trex_match(TRex* exp,const TRexChar* text)
returns TRex_True if the string specified in the parameter text is an
exact match of the expression, otherwise returns TRex_False.
exp
the compiled expression
text
the string that has to be tested
----------------------------------------------------------------------
TRexBool trex_search(TRex* exp,const TRexChar* text, const TRexChar** out_begin, const TRexChar** out_end)
searches the first match of the expressin in the string specified in the parameter text.
if the match is found returns TRex_True and the sets out_begin to the beginning of the
match and out_end at the end of the match; otherwise returns TRex_False.
exp
the compiled expression
text
the string that has to be tested
out_begin
a pointer to a string pointer that will be set with the beginning of the match
out_end
a pointer to a string pointer that will be set with the end of the match
----------------------------------------------------------------------
TREX_API TRexBool trex_searchrange(TRex* exp,const TRexChar* text_begin,const TRexChar* text_end,const TRexChar** out_begin, const TRexChar** out_end)
searches the first match of the expressin in the string delimited
by the parameter text_begin and text_end.
if the match is found returns TRex_True and the sets out_begin to the beginning of the
match and out_end at the end of the match; otherwise returns TRex_False.
exp
the compiled expression
text_begin
a pointer to the beginnning of the string that has to be tested
text_end
a pointer to the end of the string that has to be tested
out_begin
a pointer to a string pointer that will be set with the beginning of the match
out_end
a pointer to a string pointer that will be set with the end of the match
----------------------------------------------------------------------
int trex_getsubexpcount(TRex* exp)
returns the number of sub expressions matched by the expression
exp
the compiled expression
---------------------------------------------------------------------
TRexBool trex_getsubexp(TRex* exp, int n, TRexMatch *submatch)
retrieves the begin pointer and the length of the sub expression indexed
by n. The result is passed through the struct TRexMatch:
typedef struct {
const TRexChar *begin;
int len;
} TRexMatch;
the function returns TRex_True if n is a valid index, otherwise TRex_False.
exp
the compiled expression
n
the index of the submatch
submatch
a pointer to structure that will store the result
this function also works after a match operation has been performed.

View file

@ -1,163 +0,0 @@
/*************************************************/
/* regex.cpp */
/*************************************************/
/* This file is part of: */
/* GODOT ENGINE */
/*************************************************/
/* Source code within this file is: */
/* (c) 2007-2010 Juan Linietsky, Ariel Manzur */
/* All Rights Reserved. */
/*************************************************/
#include "regex.h"
extern "C" {
#define _UNICODE
#include "trex.h"
};
// Registers the script-visible RegEx methods with ObjectTypeDB.
void RegEx::_bind_methods() {

	// compile(pattern)
	ObjectTypeDB::bind_method(
		_MD("compile","pattern"),
		&RegEx::compile);

	// find(text, start, end) -- start defaults to 0 and end to -1
	ObjectTypeDB::bind_method(
		_MD("find","text", "start","end"),
		&RegEx::_bind_find,
		DEFVAL(0),
		DEFVAL(-1));

	// get_captures()
	ObjectTypeDB::bind_method(
		_MD("get_captures"),
		&RegEx::_bind_get_captures);
}
// Drops any previously compiled expression and compiles p_pattern.
// Returns OK on success, or FAILED when trex_compile() yields no expression.
Error RegEx::compile(const String& p_pattern) {

	// Receives the message from trex_compile(); it is not inspected here.
	const TRexChar* compile_error;

	clear();
	exp = trex_compile(p_pattern.c_str(), &compile_error);
	ERR_FAIL_COND_V(!exp, FAILED);

	return OK;
}
// Script binding for find(): returns the index where the first match starts,
// or -1 when find() reports no match. Captures are not collected.
int RegEx::_bind_find(const String& p_text, int p_start, int p_end) const {

	int match_begin, match_end;

	if (!find(p_text, match_begin, match_end, NULL, p_start, p_end))
		return -1;

	return match_begin;
}
bool RegEx::find(const String& p_text, int& p_rstart, int &p_rend, List<String>* p_captures, int p_start, int p_end) const {
ERR_FAIL_COND_V( !exp, false );
text=p_text;
const CharType* str = p_text.c_str();
const CharType* start = str + p_start;
const CharType* end = str + (p_end == -1?p_text.size():p_end);
const CharType* out_begin;
const CharType* out_end;
bool ret = trex_searchrange(exp, start, end, &out_begin, &out_end);
if (ret) {
p_rstart = out_begin - str;
p_rend = out_end - str;
if (p_captures) {
int count = get_capture_count();
for (int i=0; i<count; i++) {
int start, len;
get_capture_limits(i, start, len);
p_captures->push_back(p_text.substr(start, len));
};
};
} else {
p_rstart = -1;
};
return ret;
};
bool RegEx::match(const String& p_text, List<String>* p_captures, int p_start, int p_end) const {
ERR_FAIL_COND_V( !exp, false );
int start, end;
return find(p_text, start, end, p_captures, p_start, p_end);
};
// Returns the number of sub expressions reported by trex_getsubexpcount(),
// or -1 (after logging an error) when no expression is compiled.
int RegEx::get_capture_count() const {

	ERR_FAIL_COND_V( exp == NULL, -1 );

	const int capture_total = trex_getsubexpcount(exp);
	return capture_total;
}
// Retrieves the position of capture p_capture inside the last searched text.
//
// p_start  receives the capture's start index, or -1 when the group has no match.
// p_len    receives the capture's length (0 when the group has no match).
//
// Returns ERR_UNCONFIGURED when no expression is compiled, FAILED when
// p_capture is not a valid index, and OK otherwise.
Error RegEx::get_capture_limits(int p_capture, int& p_start, int& p_len) const {

	ERR_FAIL_COND_V( exp == NULL, ERR_UNCONFIGURED );

	TRexMatch match;
	TRexBool res = trex_getsubexp(exp, p_capture, &match);
	ERR_FAIL_COND_V( !res, FAILED );

	// T-Rex leaves `begin` null for a group that did not take part in the
	// match (and before any search). Subtracting the text base from a null
	// pointer would produce a meaningless offset, so report "no match".
	if (match.begin == NULL) {
		p_start = -1;
		p_len = 0;
		return OK;
	}

	p_start = (int)(match.begin - text.c_str());
	p_len = match.len;

	return OK;
}
// Returns the text of capture p_idx from the last searched string.
// An empty string is returned for zero-length captures and on any error.
String RegEx::get_capture(int p_idx) const {

	ERR_FAIL_COND_V( exp == NULL, "" );

	int capture_start, capture_len;
	Error ret = get_capture_limits(p_idx, capture_start, capture_len);
	ERR_FAIL_COND_V(ret != OK, "");

	if (capture_len != 0)
		return text.substr(capture_start, capture_len);

	return "";
}
// Script binding: returns every capture of the last search as a StringArray,
// in capture-index order.
StringArray RegEx::_bind_get_captures() const {

	StringArray captures;
	const int total = get_capture_count();

	int index = 0;
	while (index < total) {

		String capture = get_capture(index);
		captures.push_back(capture);
		++index;
	}

	return captures;
}
// True when an expression is currently compiled.
bool RegEx::is_valid() const {

	if (exp == NULL)
		return false;
	return true;
}
// Releases the compiled expression, if any, and marks the object as empty.
void RegEx::clear() {

	if (!exp)
		return;

	trex_free(exp);
	exp = NULL;
}
// Constructs a RegEx and immediately compiles p_pattern.
// A failed compile leaves the object without an expression (see is_valid()).
RegEx::RegEx(const String& p_pattern) : exp(NULL) {

	compile(p_pattern);
}
// Constructs an empty RegEx with no compiled expression.
RegEx::RegEx() : exp(NULL) {
}
// Frees the compiled expression, if one is still held.
RegEx::~RegEx() {

	if (exp) {
		trex_free(exp);
		exp = NULL;
	}
}

View file

@ -1,41 +0,0 @@
#include "trex.h"
#include <stdio.h>
#include <string.h>
/* Selects the string-formatting function that matches TRexChar's width. */
/* NOTE(review): standard swprintf takes a buffer-size argument after the */
/* destination that sprintf does not, so the two are not call-compatible; */
/* confirm every call site passes the right arguments in wide builds. */
#ifdef _UNICODE
#define trex_sprintf swprintf
#else
#define trex_sprintf sprintf
#endif
int main(int argc, char* argv[])
{
const TRexChar *begin,*end;
TRexChar sTemp[200];
const TRexChar *error = NULL;
TRex *x = trex_compile(_TREXC("(x{1,5})xx"),&error);
if(x) {
trex_sprintf(sTemp,_TREXC("xxxxxxx"));
if(trex_search(x,sTemp,&begin,&end))
{
int i,n = trex_getsubexpcount(x);
TRexMatch match;
for(i = 0; i < n; i++)
{
TRexChar t[200];
trex_getsubexp(x,i,&match);
trex_sprintf(t,_TREXC("[%%d]%%.%ds\n"),match.len);
trex_printf(t,i,match.begin);
}
trex_printf(_TREXC("match! %d sub matches\n"),trex_getsubexpcount(x));
}
else {
trex_printf(_TREXC("no match!\n"));
}
trex_free(x);
}
else {
trex_printf(_TREXC("compilation error [%s]!\n"),error?error:_TREXC("undefined"));
}
return 0;
}

View file

@ -1,643 +0,0 @@
/* see copyright notice in trex.h */
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include <setjmp.h>
#include "trex.h"
/* Character-width dependent helpers used by the pattern compiler and matcher. */
/* The guard previously tested the misspelled `_UINCODE`, so the wide branch */
/* was never selected, and `_SC` pasted `c` instead of its parameter. */
/* NOTE(review): the _DEBUG node dump in trex_compile() prints TRexChar */
/* strings with a %s conversion; with wprintf that conversion expects a */
/* narrow string, so the dump format should be revisited for wide builds. */
#ifdef _UNICODE
#include <wchar.h>
#include <wctype.h>
#define scisprint iswprint
#define scstrlen wcslen
#define scprintf wprintf
#define _SC(x) L##x
#else
#define scisprint isprint
#define scstrlen strlen
#define scprintf printf
#define _SC(x) (x)
#endif
#ifdef _DEBUG
#include <stdio.h>
/* Printable opcode names for the debug node dump, indexed by (type - MAX_CHAR). */
static const TRexChar *g_nnames[] =
{
	_SC("NONE"),         /*  0 */
	_SC("OP_GREEDY"),    /*  1 */
	_SC("OP_OR"),        /*  2 */
	_SC("OP_EXPR"),      /*  3 */
	_SC("OP_NOCAPEXPR"), /*  4 */
	_SC("OP_DOT"),       /*  5 */
	_SC("OP_CLASS"),     /*  6 */
	_SC("OP_CCLASS"),    /*  7 */
	_SC("OP_NCLASS"),    /*  8 */
	_SC("OP_RANGE"),     /*  9 */
	_SC("OP_CHAR"),      /* 10 */
	_SC("OP_EOL"),       /* 11 */
	_SC("OP_BOL"),       /* 12 */
	_SC("OP_WB")         /* 13 */
};
#endif
/* Node opcodes. They start above MAX_CHAR so that a node type at or below */
/* MAX_CHAR can stand for a literal character to match. */
#define OP_GREEDY (MAX_CHAR+1) // repetition: * + ? {n}
#define OP_OR (MAX_CHAR+2) // alternation between the left and right branches
#define OP_EXPR (MAX_CHAR+3) // capturing parentheses ()
#define OP_NOCAPEXPR (MAX_CHAR+4) // non-capturing parentheses (?:)
#define OP_DOT (MAX_CHAR+5) // any character
#define OP_CLASS (MAX_CHAR+6) // character class [...]
#define OP_CCLASS (MAX_CHAR+7) // escaped character class such as \w or \d
#define OP_NCLASS (MAX_CHAR+8) // negated character class [^...]
#define OP_RANGE (MAX_CHAR+9) // character range inside a class
#define OP_CHAR (MAX_CHAR+10)
#define OP_EOL (MAX_CHAR+11) // end-of-text anchor
#define OP_BOL (MAX_CHAR+12) // beginning-of-text anchor
#define OP_WB (MAX_CHAR+13) // \b and \B
/* Pattern metacharacters recognised by the parser. */
#define TREX_SYMBOL_ANY_CHAR ('.')
#define TREX_SYMBOL_GREEDY_ONE_OR_MORE ('+')
#define TREX_SYMBOL_GREEDY_ZERO_OR_MORE ('*')
#define TREX_SYMBOL_GREEDY_ZERO_OR_ONE ('?')
#define TREX_SYMBOL_BRANCH ('|')
#define TREX_SYMBOL_END_OF_STRING ('$')
/* Also used as the negation marker at the start of a character class. */
#define TREX_SYMBOL_BEGINNING_OF_STRING ('^')
#define TREX_SYMBOL_ESCAPE_CHAR ('\\')
typedef int TRexNodeType;
typedef struct tagTRexNode{
TRexNodeType type;
int left;
int right;
int next;
}TRexNode;
/* Compiled expression plus the scratch state used while parsing and matching. */
struct TRex{
const TRexChar *_eol; /* end of the text being matched */
const TRexChar *_bol; /* beginning of the text being matched */
const TRexChar *_p; /* parse cursor into the pattern */
int _first; /* index of the root OP_EXPR node */
int _op; /* not referenced by the code visible in this file */
TRexNode *_nodes; /* node array, grown by trex_newnode() */
int _nallocated; /* capacity of _nodes, in nodes */
int _nsize; /* number of nodes in use */
int _nsubexpr; /* number of OP_EXPR nodes created, root included */
TRexMatch *_matches; /* one entry per sub expression, allocated by trex_compile() */
int _currsubexp; /* index of the next capture to be filled while matching */
void *_jmpbuf; /* heap-allocated jmp_buf that trex_error() jumps to */
const TRexChar **_error; /* where trex_error() stores its message; may be NULL */
};
/* Forward declaration: trex_element() and trex_list() call each other. */
static int trex_list(TRex *exp);
/* Appends a node of the given type to exp->_nodes and returns its index. */
/* All links of the new node start at -1; an OP_EXPR node additionally */
/* receives the next capture index in `right`. */
/* The node array may be reallocated here, so callers must not keep */
/* TRexNode pointers across a call. */
static int trex_newnode(TRex *exp, TRexNodeType type)
{
	TRexNode n;
	n.type = type;
	n.next = n.right = n.left = -1;
	if(type == OP_EXPR)
		n.right = exp->_nsubexpr++;
	if(exp->_nallocated < (exp->_nsize + 1)) {
		/* Doubling a capacity of zero stays zero and would leave no room */
		/* for the node written below, so start from a small fixed size. */
		exp->_nallocated = (exp->_nallocated > 0) ? exp->_nallocated * 2 : 4;
		/* NOTE(review): a failed realloc is not handled, as before. */
		exp->_nodes = (TRexNode *)realloc(exp->_nodes, exp->_nallocated * sizeof(TRexNode));
	}
	exp->_nodes[exp->_nsize] = n;
	return exp->_nsize++;
}
/* Aborts compilation: stores the message through exp->_error (when one was */
/* supplied) and jumps back to the setjmp point saved in exp->_jmpbuf. */
/* Does not return to its caller. */
static void trex_error(TRex *exp,const TRexChar *error)
{
	jmp_buf *recovery = (jmp_buf*)exp->_jmpbuf;

	if(exp->_error != NULL) {
		*exp->_error = error;
	}
	longjmp(*recovery,-1);
}
/* Consumes the character n from the pattern, or raises a compile error */
/* naming the delimiter that was expected. */
static void trex_expect(TRex *exp, int n){
	if((*exp->_p) != n) {
		const TRexChar *message;
		/* Callers expect ')', ']', '}' and ':'; report the right one */
		/* instead of always claiming a missing parenthesis. */
		switch(n) {
			case ')': message = _SC("expected paren"); break;
			case ']': message = _SC("expected ']'"); break;
			case '}': message = _SC("expected '}'"); break;
			case ':': message = _SC("expected ':'"); break;
			default: message = _SC("unexpected character"); break;
		}
		trex_error(exp, message);
	}
	exp->_p++;
}
/* Reads one character (used for the upper bound of a class range), */
/* translating the escapes \v \n \t \r \f; any other escaped character stands */
/* for itself. Raises a compile error on a non-printable character or on a */
/* backslash that ends the pattern. */
static TRexChar trex_escapechar(TRex *exp)
{
	if(*exp->_p == TREX_SYMBOL_ESCAPE_CHAR){
		exp->_p++;
		/* A trailing backslash must not consume the terminator, otherwise */
		/* the parser would keep reading past the end of the pattern. */
		if(*exp->_p == '\0') trex_error(exp,_SC("unfinished escape sequence"));
		switch(*exp->_p) {
			case 'v': exp->_p++; return '\v';
			case 'n': exp->_p++; return '\n';
			case 't': exp->_p++; return '\t';
			case 'r': exp->_p++; return '\r';
			case 'f': exp->_p++; return '\f';
			default: return (*exp->_p++);
		}
	} else if(!scisprint(*exp->_p)) trex_error(exp,_SC("letter expected"));
	return (*exp->_p++);
}
/* Creates an OP_CCLASS node whose `left` holds the class letter and returns */
/* the node's index. */
static int trex_charclass(TRex *exp,int classid)
{
	const int index = trex_newnode(exp,OP_CCLASS);
	TRexNode *created = &exp->_nodes[index];

	created->left = classid;
	return index;
}
/* Parses a single character or escape at the cursor and returns the index */
/* of the node created for it. */
/*   \n \t \r \f \v           literal control characters */
/*   \a \w \s \d \x \c \p and their upper-case forms, \l \u */
/*                            OP_CCLASS node for that class letter */
/*   \b \B                    OP_WB node, unless isclass is set, in which */
/*                            case the letter is taken literally */
/*   any other escape         the escaped character itself */
/* Raises a compile error on a non-printable character or on a backslash */
/* that ends the pattern. */
static int trex_charnode(TRex *exp,TRexBool isclass)
{
	TRexChar t;
	if(*exp->_p == TREX_SYMBOL_ESCAPE_CHAR) {
		exp->_p++;
		/* A trailing backslash must not consume the terminator, otherwise */
		/* the parser would keep reading past the end of the pattern. */
		if(*exp->_p == '\0') trex_error(exp,_SC("unfinished escape sequence"));
		switch(*exp->_p) {
			case 'n': exp->_p++; return trex_newnode(exp,'\n');
			case 't': exp->_p++; return trex_newnode(exp,'\t');
			case 'r': exp->_p++; return trex_newnode(exp,'\r');
			case 'f': exp->_p++; return trex_newnode(exp,'\f');
			case 'v': exp->_p++; return trex_newnode(exp,'\v');
			case 'a': case 'A': case 'w': case 'W': case 's': case 'S':
			case 'd': case 'D': case 'x': case 'X': case 'c': case 'C':
			case 'p': case 'P': case 'l': case 'u':
				{
					t = *exp->_p; exp->_p++;
					return trex_charclass(exp,t);
				}
			case 'b':
			case 'B':
				if(!isclass) {
					int node = trex_newnode(exp,OP_WB);
					exp->_nodes[node].left = *exp->_p;
					exp->_p++;
					return node;
				}
				/* inside a class: fall through and treat as a literal */
			default:
				t = *exp->_p; exp->_p++;
				return trex_newnode(exp,t);
		}
	}
	else if(!scisprint(*exp->_p)) {
		trex_error(exp,_SC("letter expected"));
	}
	t = *exp->_p; exp->_p++;
	return trex_newnode(exp,t);
}
/* Parses the inside of a character class; the cursor is just past '['. */
/* Returns the index of an OP_CLASS (or OP_NCLASS for "[^") node whose `left` */
/* heads the chain of member nodes. The closing ']' is left for the caller. */
/* Raises a compile error for an empty class, an unfinished or reversed */
/* range, or a range that starts with an escaped character class. */
static int trex_class(TRex *exp)
{
	int ret = -1;
	int first = -1,chain; /* first: parsed member not yet linked into the chain */
	if(*exp->_p == TREX_SYMBOL_BEGINNING_OF_STRING){
		ret = trex_newnode(exp,OP_NCLASS);
		exp->_p++;
	}else ret = trex_newnode(exp,OP_CLASS);

	if(*exp->_p == ']') trex_error(exp,_SC("empty class"));
	chain = ret;
	/* _eol is NULL while compiling (see trex_compile), so the second test */
	/* never stops the loop; an unterminated class is reported by */
	/* trex_charnode()/trex_escapechar() when they reach the terminator. */
	while(*exp->_p != ']' && exp->_p != exp->_eol) {
		if(*exp->_p == '-' && first != -1){
			int r,lower,upper;
			exp->_p++; /* consume '-' */
			/* Test the character after the dash; the dash itself is */
			/* already known and can never be ']'. */
			if(*exp->_p == ']') trex_error(exp,_SC("unfinished range"));
			r = trex_newnode(exp,OP_RANGE);
			lower = exp->_nodes[first].type;
			if(lower == OP_CCLASS) trex_error(exp,_SC("cannot use character classes in ranges"));
			upper = trex_escapechar(exp);
			/* Compare the two bounds. Comparing the node index `first` */
			/* with a character rejected valid ranges in long patterns. */
			if(lower > upper) trex_error(exp,_SC("invalid range"));
			exp->_nodes[r].left = lower;
			exp->_nodes[r].right = upper;
			exp->_nodes[chain].next = r;
			chain = r;
			first = -1;
		}
		else{
			if(first!=-1){
				/* link the pending member before parsing the next one */
				int c = first;
				exp->_nodes[chain].next = c;
				chain = c;
				first = trex_charnode(exp,TRex_True);
			}
			else{
				first = trex_charnode(exp,TRex_True);
			}
		}
	}
	if(first!=-1){
		/* link the last pending member */
		exp->_nodes[chain].next = first;
	}
	/* The members were chained through the class node's `next`; move the */
	/* chain to `left` so that `next` can link the class to what follows it. */
	exp->_nodes[ret].left = exp->_nodes[ret].next;
	exp->_nodes[ret].next = -1;
	return ret;
}
/* Parses an unsigned decimal number at the cursor and returns its value. */
/* The caller guarantees the cursor is on a digit. At most nine digits are */
/* accepted; a tenth raises "overflow in numeric constant". */
static int trex_parsenumber(TRex *exp)
{
	int ret = *exp->_p-'0';
	int digits = 1;
	exp->_p++;
	/* Digits are tested by range rather than with isdigit(), whose */
	/* argument must be representable as unsigned char. */
	while(*exp->_p >= '0' && *exp->_p <= '9') {
		/* Check before multiplying so the accumulator never overflows; */
		/* nine digits always fit in an int of at least 32 bits. */
		if(digits == 9) trex_error(exp,_SC("overflow in numeric constant"));
		ret = ret*10+(*exp->_p++-'0');
		digits++;
	}
	return ret;
}
/* Parses one repetition count of a {n}, {n,} or {n,m} quantifier. */
/* Counts are stored in 16 bits, so larger values are a compile error */
/* instead of being silently truncated. */
static unsigned short trex_parsecount(TRex *exp)
{
	int count = trex_parsenumber(exp);
	if(count > 0xFFFF) trex_error(exp,_SC("repetition count too large"));
	return (unsigned short)count;
}

/* Parses one element (group, class, anchor, dot or character) together with */
/* an optional quantifier, then recursively parses the elements that follow */
/* it and links them through `next`. Returns the index of the element's node, */
/* which is the OP_GREEDY node when a quantifier is present. */
static int trex_element(TRex *exp)
{
	int ret = -1;
	switch(*exp->_p)
	{
	case '(': {
		int expr,newn;
		exp->_p++;

		if(*exp->_p =='?') {
			exp->_p++;
			trex_expect(exp,':');
			expr = trex_newnode(exp,OP_NOCAPEXPR);
		}
		else
			expr = trex_newnode(exp,OP_EXPR);
		/* trex_list() may reallocate the node array, so store its result */
		/* in a local before indexing _nodes. */
		newn = trex_list(exp);
		exp->_nodes[expr].left = newn;
		ret = expr;
		trex_expect(exp,')');
			  }
			  break;
	case '[':
		exp->_p++;
		ret = trex_class(exp);
		trex_expect(exp,']');
		break;
	case TREX_SYMBOL_END_OF_STRING: exp->_p++; ret = trex_newnode(exp,OP_EOL);break;
	case TREX_SYMBOL_ANY_CHAR: exp->_p++; ret = trex_newnode(exp,OP_DOT);break;
	default:
		ret = trex_charnode(exp,TRex_False);
		break;
	}

	{
		TRexBool isgreedy = TRex_False;
		unsigned short p0 = 0, p1 = 0; /* min and max counts; max 0xFFFF = unbounded */
		switch(*exp->_p){
			case TREX_SYMBOL_GREEDY_ZERO_OR_MORE: p0 = 0; p1 = 0xFFFF; exp->_p++; isgreedy = TRex_True; break;
			case TREX_SYMBOL_GREEDY_ONE_OR_MORE: p0 = 1; p1 = 0xFFFF; exp->_p++; isgreedy = TRex_True; break;
			case TREX_SYMBOL_GREEDY_ZERO_OR_ONE: p0 = 0; p1 = 1; exp->_p++; isgreedy = TRex_True; break;
			case '{':
				exp->_p++;
				if(!(*exp->_p >= '0' && *exp->_p <= '9')) trex_error(exp,_SC("number expected"));
				p0 = trex_parsecount(exp);
				/*******************************/
				switch(*exp->_p) {
			case '}':
				p1 = p0; exp->_p++;
				break;
			case ',':
				exp->_p++;
				p1 = 0xFFFF;
				if(*exp->_p >= '0' && *exp->_p <= '9'){
					p1 = trex_parsecount(exp);
				}
				trex_expect(exp,'}');
				break;
			default:
				trex_error(exp,_SC(", or } expected"));
				}
				/*******************************/
				isgreedy = TRex_True;
				break;

		}
		if(isgreedy) {
			int nnode = trex_newnode(exp,OP_GREEDY);
			exp->_nodes[nnode].left = ret;
			/* Pack in unsigned arithmetic so a large minimum does not shift */
			/* into the sign bit of an int. */
			exp->_nodes[nnode].right = (int)(((unsigned int)p0<<16)|(unsigned int)p1);
			ret = nnode;
		}
	}
	if((*exp->_p != TREX_SYMBOL_BRANCH) && (*exp->_p != ')') && (*exp->_p != TREX_SYMBOL_GREEDY_ZERO_OR_MORE) && (*exp->_p != TREX_SYMBOL_GREEDY_ONE_OR_MORE) && (*exp->_p != '\0')) {
		int nnode = trex_element(exp);
		exp->_nodes[ret].next = nnode;
	}

	return ret;
}
/* Parses a sequence of elements with an optional leading '^' anchor and an */
/* optional '|' alternative. Returns the index of the sequence's first node, */
/* or of the OP_OR node when an alternative follows. */
static int trex_list(TRex *exp)
{
	int head = -1;
	int element;

	if(*exp->_p == TREX_SYMBOL_BEGINNING_OF_STRING) {
		exp->_p++;
		head = trex_newnode(exp,OP_BOL);
	}

	element = trex_element(exp);
	if(head == -1) {
		head = element;
	}
	else {
		exp->_nodes[head].next = element;
	}

	if(*exp->_p != TREX_SYMBOL_BRANCH)
		return head;

	{
		int branch,right_side;
		exp->_p++;
		branch = trex_newnode(exp,OP_OR);
		exp->_nodes[branch].left = head;
		/* The recursive call may reallocate the node array, so keep its */
		/* result in a local before indexing _nodes. */
		right_side = trex_list(exp);
		exp->_nodes[branch].right = right_side;
		return branch;
	}
}
/* Tests whether character c belongs to the escaped class named by cclass */
/* (the letter that followed the backslash, e.g. 'w' or 'D'). */
/* The <ctype.h> predicates are only defined for values representable as */
/* unsigned char, so a character outside 0..0xFF is treated as a member of */
/* no class: positive classes fail and negated (upper-case) classes succeed. */
static TRexBool trex_matchcclass(int cclass,TRexChar c)
{
	const long code = (long)c;
	int ch;
	if(code < 0 || code > 0xFF) {
		switch(cclass) {
		case 'A': case 'W': case 'S': case 'D':
		case 'X': case 'C': case 'P':
			return TRex_True;
		default:
			return TRex_False;
		}
	}
	ch = (int)code;
	switch(cclass) {
	case 'a': return isalpha(ch)?TRex_True:TRex_False;
	case 'A': return !isalpha(ch)?TRex_True:TRex_False;
	case 'w': return (isalnum(ch) || ch == '_')?TRex_True:TRex_False;
	case 'W': return (!isalnum(ch) && ch != '_')?TRex_True:TRex_False;
	case 's': return isspace(ch)?TRex_True:TRex_False;
	case 'S': return !isspace(ch)?TRex_True:TRex_False;
	case 'd': return isdigit(ch)?TRex_True:TRex_False;
	case 'D': return !isdigit(ch)?TRex_True:TRex_False;
	case 'x': return isxdigit(ch)?TRex_True:TRex_False;
	case 'X': return !isxdigit(ch)?TRex_True:TRex_False;
	case 'c': return iscntrl(ch)?TRex_True:TRex_False;
	case 'C': return !iscntrl(ch)?TRex_True:TRex_False;
	case 'p': return ispunct(ch)?TRex_True:TRex_False;
	case 'P': return !ispunct(ch)?TRex_True:TRex_False;
	case 'l': return islower(ch)?TRex_True:TRex_False;
	case 'u': return isupper(ch)?TRex_True:TRex_False;
	}
	return TRex_False; /*cannot happen*/
}
/* Walks the chain of class members starting at node and reports whether c */
/* matches any of them: a range, an escaped class, or a literal character. */
static TRexBool trex_matchclass(TRex* exp,TRexNode *node,TRexChar c)
{
	for(;;) {
		if(node->type == OP_RANGE) {
			if(c >= node->left && c <= node->right) return TRex_True;
		}
		else if(node->type == OP_CCLASS) {
			if(trex_matchcclass(node->left,c)) return TRex_True;
		}
		else if(c == node->type) {
			return TRex_True;
		}

		if(node->next == -1)
			break;
		node = &exp->_nodes[node->next];
	}
	return TRex_False;
}
/* Tries to match a single node at position str. */
/*   exp   compiled expression; _bol/_eol delimit the text being matched */
/*   node  node to match */
/*   str   current position in the text */
/*   next  node that follows the enclosing construct (may be NULL); used by */
/*         repetitions to decide when to stop consuming */
/* Returns the position just after the matched text, or NULL on failure. */
/* Nodes that consume a character (dot, classes, literals) fail at _eol, so */
/* a match never extends past the end of the text range. */
static const TRexChar *trex_matchnode(TRex* exp,TRexNode *node,const TRexChar *str,TRexNode *next)
{

	TRexNodeType type = node->type;
	switch(type) {
	case OP_GREEDY: {
		//TRexNode *greedystop = (node->next != -1) ? &exp->_nodes[node->next] : NULL;
		TRexNode *greedystop = NULL;
		/* p0/p1: minimum and maximum repetition counts unpacked from `right` */
		int p0 = (node->right >> 16)&0x0000FFFF, p1 = node->right&0x0000FFFF, nmaches = 0;
		const TRexChar *s=str, *good = str;

		if(node->next != -1) {
			greedystop = &exp->_nodes[node->next];
		}
		else {
			greedystop = next;
		}

		while((nmaches == 0xFFFF || nmaches < p1)) {

			const TRexChar *stop;
			if(!(s = trex_matchnode(exp,&exp->_nodes[node->left],s,greedystop)))
				break;
			nmaches++;
			good=s;
			if(greedystop) {
				//checks that 0 matches satisfy the expression(if so skips)
				//if not would always stop(for instance if is a '?')
				if(greedystop->type != OP_GREEDY ||
				(greedystop->type == OP_GREEDY && ((greedystop->right >> 16)&0x0000FFFF) != 0))
				{
					TRexNode *gnext = NULL;
					if(greedystop->next != -1) {
						gnext = &exp->_nodes[greedystop->next];
					}else if(next && next->next != -1){
						gnext = &exp->_nodes[next->next];
					}
					stop = trex_matchnode(exp,greedystop,s,gnext);
					if(stop) {
						//if satisfied stop it
						if(p0 == p1 && p0 == nmaches) break;
						else if(nmaches >= p0 && p1 == 0xFFFF) break;
						else if(nmaches >= p0 && nmaches <= p1) break;
					}
				}
			}

			if(s >= exp->_eol)
				break;
		}
		if(p0 == p1 && p0 == nmaches) return good;
		else if(nmaches >= p0 && p1 == 0xFFFF) return good;
		else if(nmaches >= p0 && nmaches <= p1) return good;
		return NULL;
	}
	case OP_OR: {
			/* try the whole left sequence first, then the right one */
			const TRexChar *asd = str;
			TRexNode *temp=&exp->_nodes[node->left];
			while( (asd = trex_matchnode(exp,temp,asd,NULL)) ) {
				if(temp->next != -1)
					temp = &exp->_nodes[temp->next];
				else
					return asd;
			}
			asd = str;
			temp = &exp->_nodes[node->right];
			while( (asd = trex_matchnode(exp,temp,asd,NULL)) ) {
				if(temp->next != -1)
					temp = &exp->_nodes[temp->next];
				else
					return asd;
			}
			return NULL;
			break;
	}
	case OP_EXPR:
	case OP_NOCAPEXPR:{
			TRexNode *n = &exp->_nodes[node->left];
			const TRexChar *cur = str;
			int capture = -1;
			/* a capturing group records its span when its index is the next */
			/* one expected */
			if(node->type != OP_NOCAPEXPR && node->right == exp->_currsubexp) {
				capture = exp->_currsubexp;
				exp->_matches[capture].begin = cur;
				exp->_currsubexp++;
			}

			do {
				TRexNode *subnext = NULL;
				if(n->next != -1) {
					subnext = &exp->_nodes[n->next];
				}else {
					subnext = next;
				}
				if(!(cur = trex_matchnode(exp,n,cur,subnext))) {
					if(capture != -1){
						exp->_matches[capture].begin = 0;
						exp->_matches[capture].len = 0;
					}
					return NULL;
				}
			} while((n->next != -1) && (n = &exp->_nodes[n->next]));

			if(capture != -1)
				exp->_matches[capture].len = cur - exp->_matches[capture].begin;
			return cur;
	}
	case OP_WB:
		if((str == exp->_bol && !isspace(*str))
		 || (str == exp->_eol && !isspace(*(str-1)))
		 || (!isspace(*str) && isspace(*(str+1)))
		 || (isspace(*str) && !isspace(*(str+1))) ) {
			return (node->left == 'b')?str:NULL;
		}
		return (node->left == 'b')?NULL:str;
	case OP_BOL:
		if(str == exp->_bol) return str;
		return NULL;
	case OP_EOL:
		if(str == exp->_eol) return str;
		return NULL;
	case OP_DOT:
		/* nothing left to consume at the end of the range */
		if(str >= exp->_eol) return NULL;
		return str + 1;
	case OP_NCLASS:
	case OP_CLASS:
		if(str >= exp->_eol) return NULL;
		if(trex_matchclass(exp,&exp->_nodes[node->left],*str)?(type == OP_CLASS?TRex_True:TRex_False):(type == OP_NCLASS?TRex_True:TRex_False)) {
			return str + 1;
		}
		return NULL;
	case OP_CCLASS:
		if(str >= exp->_eol) return NULL;
		if(trex_matchcclass(node->left,*str)) {
			return str + 1;
		}
		return NULL;
	default: /* char */
		if(str >= exp->_eol || *str != node->type) return NULL;
		return str + 1;
	}
	return NULL;
}
/* public api */
TRex *trex_compile(const TRexChar *pattern,const TRexChar **error)
{
TRex *exp = (TRex *)malloc(sizeof(TRex));
exp->_eol = exp->_bol = NULL;
exp->_p = pattern;
exp->_nallocated = (int)scstrlen(pattern) * sizeof(TRexChar);
exp->_nodes = (TRexNode *)malloc(exp->_nallocated * sizeof(TRexNode));
exp->_nsize = 0;
exp->_matches = 0;
exp->_nsubexpr = 0;
exp->_first = trex_newnode(exp,OP_EXPR);
exp->_error = error;
exp->_jmpbuf = malloc(sizeof(jmp_buf));
if(setjmp(*((jmp_buf*)exp->_jmpbuf)) == 0) {
int res = trex_list(exp);
exp->_nodes[exp->_first].left = res;
if(*exp->_p!='\0')
trex_error(exp,_SC("unexpected character"));
#ifdef _DEBUG
{
int nsize,i;
TRexNode *t;
nsize = exp->_nsize;
t = &exp->_nodes[0];
scprintf(_SC("\n"));
for(i = 0;i < nsize; i++) {
if(exp->_nodes[i].type>MAX_CHAR)
scprintf(_SC("[%02d] %10s "),i,g_nnames[exp->_nodes[i].type-MAX_CHAR]);
else
scprintf(_SC("[%02d] %10c "),i,exp->_nodes[i].type);
scprintf(_SC("left %02d right %02d next %02d\n"),exp->_nodes[i].left,exp->_nodes[i].right,exp->_nodes[i].next);
}
scprintf(_SC("\n"));
}
#endif
exp->_matches = (TRexMatch *) malloc(exp->_nsubexpr * sizeof(TRexMatch));
memset(exp->_matches,0,exp->_nsubexpr * sizeof(TRexMatch));
}
else{
trex_free(exp);
return NULL;
}
return exp;
}
/* Releases an expression created by trex_compile(). Passing NULL is allowed */
/* and does nothing. */
void trex_free(TRex *exp)
{
	if(!exp)
		return;

	/* free() accepts null pointers, so the members need no individual checks */
	free(exp->_nodes);
	free(exp->_jmpbuf);
	free(exp->_matches);
	free(exp);
}
/* Returns TRex_True only when the expression matches the whole of text, */
/* i.e. the match starts at the first character and ends at the terminator. */
TRexBool trex_match(TRex* exp,const TRexChar* text)
{
	const TRexChar *match_end;

	exp->_bol = text;
	exp->_eol = text + scstrlen(text);
	exp->_currsubexp = 0;

	match_end = trex_matchnode(exp,exp->_nodes,text,NULL);
	if(match_end != NULL && match_end == exp->_eol)
		return TRex_True;
	return TRex_False;
}
/* Searches [text_begin, text_end) for the first position at which the */
/* expression matches. On success stores the match bounds through out_begin */
/* and out_end (either may be NULL) and returns TRex_True. Returns TRex_False */
/* when nothing matches or when the range is empty. */
TRexBool trex_searchrange(TRex* exp,const TRexChar* text_begin,const TRexChar* text_end,const TRexChar** out_begin, const TRexChar** out_end)
{
	const TRexChar *match_end = NULL;
	const TRexChar *attempt = text_begin; /* start position currently being tried */
	int node = exp->_first;

	if(text_begin >= text_end) return TRex_False;
	exp->_bol = text_begin;
	exp->_eol = text_end;

	for(;;) {
		match_end = attempt;
		while(node != -1) {
			exp->_currsubexp = 0;
			match_end = trex_matchnode(exp,&exp->_nodes[node],match_end,NULL);
			if(!match_end)
				break;
			node = exp->_nodes[node].next;
		}
		if(match_end != NULL)
			break; /* matched at `attempt` */
		attempt++;
		if(attempt == text_end)
			break; /* every start position failed */
	}

	if(match_end == NULL)
		return TRex_False;

	if(out_begin) *out_begin = attempt;
	if(out_end) *out_end = match_end;
	return TRex_True;
}
/* Searches the whole NUL-terminated text; see trex_searchrange(). */
TRexBool trex_search(TRex* exp,const TRexChar* text, const TRexChar** out_begin, const TRexChar** out_end)
{
	const TRexChar *text_end = text + scstrlen(text);
	return trex_searchrange(exp,text,text_end,out_begin,out_end);
}
/* Returns the number of sub expressions in the compiled expression. */
int trex_getsubexpcount(TRex* exp)
{
	const int count = exp->_nsubexpr;
	return count;
}
/* Copies sub match n into *subexp. Returns TRex_False, leaving *subexp */
/* untouched, when n is not a valid sub expression index. */
TRexBool trex_getsubexp(TRex* exp, int n, TRexMatch *subexp)
{
	if(n >= 0 && n < exp->_nsubexpr) {
		*subexp = exp->_matches[n];
		return TRex_True;
	}
	return TRex_False;
}

View file

@ -1,70 +0,0 @@
#ifndef _TREX_H_
#define _TREX_H_
/***************************************************************
T-Rex a tiny regular expression library
Copyright (C) 2003-2006 Alberto Demichelis
This software is provided 'as-is', without any express
or implied warranty. In no event will the authors be held
liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for
any purpose, including commercial applications, and to alter
it and redistribute it freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented;
you must not claim that you wrote the original software.
If you use this software in a product, an acknowledgment
in the product documentation would be appreciated but
is not required.
2. Altered source versions must be plainly marked as such,
and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any
source distribution.
****************************************************************/
/* Character configuration. _UNICODE is defined unconditionally here, so the */
/* wide-character branch below is always the one selected. */
#define _UNICODE
#ifdef _UNICODE
#define TRexChar wchar_t
#define MAX_CHAR 0xFFFF
#define _TREXC(c) L##c
#define trex_strlen wcslen
#define trex_printf wprintf
#else
#define TRexChar char
#define MAX_CHAR 0xFF
#define _TREXC(c) (c)
#define trex_strlen strlen
#define trex_printf printf
#endif
/* Linkage of the public functions; may be overridden before including this header. */
#ifndef TREX_API
#define TREX_API extern
#endif
/* Boolean values returned by the API. */
#define TRex_True 1
#define TRex_False 0
typedef unsigned int TRexBool;
/* Opaque compiled expression. */
typedef struct TRex TRex;
/* A sub match: pointer to its first character in the searched text, and its length. */
typedef struct {
const TRexChar *begin;
int len;
} TRexMatch;
/* Compiles pattern; on failure the result is NULL and a message is stored through error. */
TREX_API TRex *trex_compile(const TRexChar *pattern,const TRexChar **error);
/* Releases an expression created by trex_compile(). */
TREX_API void trex_free(TRex *exp);
/* TRex_True when the whole of text matches the expression. */
TREX_API TRexBool trex_match(TRex* exp,const TRexChar* text);
/* Finds the first match in a NUL-terminated text. */
TREX_API TRexBool trex_search(TRex* exp,const TRexChar* text, const TRexChar** out_begin, const TRexChar** out_end);
/* Finds the first match in the range [text_begin, text_end). */
TREX_API TRexBool trex_searchrange(TRex* exp,const TRexChar* text_begin,const TRexChar* text_end,const TRexChar** out_begin, const TRexChar** out_end);
/* Number of sub expressions in the compiled expression. */
TREX_API int trex_getsubexpcount(TRex* exp);
/* Copies sub match n into *subexp; TRex_False when n is out of range. */
TREX_API TRexBool trex_getsubexp(TRex* exp, int n, TRexMatch *subexp);
#endif