xmlregexp

xmlregexp - regular expressions handling

basic API for libxml regular expressions handling used for XML Schemas and validation.

Author(s): Daniel Veillard

Synopsis

typedef struct _xmlExpCtxt xmlExpCtxt;
typedef xmlExpCtxt * xmlExpCtxtPtr;
typedef struct _xmlExpNode xmlExpNode;
typedef xmlExpNode * xmlExpNodePtr;
typedef enum xmlExpNodeType;
typedef struct _xmlRegExecCtxt xmlRegExecCtxt;
typedef xmlRegExecCtxt * xmlRegExecCtxtPtr;
typedef struct _xmlRegexp xmlRegexp;
typedef xmlRegexp * xmlRegexpPtr;
int	xmlExpCtxtNbCons		(xmlExpCtxtPtr ctxt);
int	xmlExpCtxtNbNodes		(xmlExpCtxtPtr ctxt);
void	xmlExpDump			(xmlBufferPtr buf,
xmlExpNodePtr expr);
xmlExpNodePtr	xmlExpExpDerive		(xmlExpCtxtPtr ctxt,
xmlExpNodePtr exp,
xmlExpNodePtr sub);
void	xmlExpFree			(xmlExpCtxtPtr ctxt,
xmlExpNodePtr exp);
void	xmlExpFreeCtxt			(xmlExpCtxtPtr ctxt);
int	xmlExpGetLanguage		(xmlExpCtxtPtr ctxt,
xmlExpNodePtr exp,
const xmlChar ** langList,
int len);
int	xmlExpGetStart			(xmlExpCtxtPtr ctxt,
xmlExpNodePtr exp,
const xmlChar ** tokList,
int len);
int	xmlExpIsNillable		(xmlExpNodePtr exp);
int	xmlExpMaxToken			(xmlExpNodePtr expr);
xmlExpNodePtr	xmlExpNewAtom		(xmlExpCtxtPtr ctxt,
const xmlChar * name,
int len);
xmlExpCtxtPtr	xmlExpNewCtxt		(int maxNodes,
xmlDictPtr dict);
xmlExpNodePtr	xmlExpNewOr		(xmlExpCtxtPtr ctxt,
xmlExpNodePtr left,
xmlExpNodePtr right);
xmlExpNodePtr	xmlExpNewRange		(xmlExpCtxtPtr ctxt,
xmlExpNodePtr subset,
int min,
int max);
xmlExpNodePtr	xmlExpNewSeq		(xmlExpCtxtPtr ctxt,
xmlExpNodePtr left,
xmlExpNodePtr right);
xmlExpNodePtr	xmlExpParse		(xmlExpCtxtPtr ctxt,
const char * expr);
void	xmlExpRef			(xmlExpNodePtr exp);
xmlExpNodePtr	xmlExpStringDerive	(xmlExpCtxtPtr ctxt,
xmlExpNodePtr exp,
const xmlChar * str,
int len);
int	xmlExpSubsume			(xmlExpCtxtPtr ctxt,
xmlExpNodePtr exp,
xmlExpNodePtr sub);
typedef void	xmlRegExecCallbacks	(xmlRegExecCtxtPtr exec,
const xmlChar * token,
void * transdata,
void * inputdata);
int	xmlRegExecErrInfo		(xmlRegExecCtxtPtr exec,
const xmlChar ** string,
int * nbval,
int * nbneg,
xmlChar ** values,
int * terminal);
int	xmlRegExecNextValues		(xmlRegExecCtxtPtr exec,
int * nbval,
int * nbneg,
xmlChar ** values,
int * terminal);
int	xmlRegExecPushString		(xmlRegExecCtxtPtr exec,
const xmlChar * value,
void * data);
int	xmlRegExecPushString2		(xmlRegExecCtxtPtr exec,
const xmlChar * value,
const xmlChar * value2,
void * data);
void	xmlRegFreeExecCtxt		(xmlRegExecCtxtPtr exec);
void	xmlRegFreeRegexp		(xmlRegexpPtr regexp);
xmlRegExecCtxtPtr	xmlRegNewExecCtxt	(xmlRegexpPtr comp,
xmlRegExecCallbacks callback,
void * data);
xmlRegexpPtr	xmlRegexpCompile	(const xmlChar * regexp);
int	xmlRegexpExec			(xmlRegexpPtr comp,
const xmlChar * content);
int	xmlRegexpIsDeterminist		(xmlRegexpPtr comp);
void	xmlRegexpPrint			(FILE * output,
xmlRegexpPtr regexp);

Description

Details

Structure xmlExpCtxt

struct _xmlExpCtxt {
The content of this structure is not made public by the API.
} xmlExpCtxt;


Typedef xmlExpCtxtPtr

xmlExpCtxt * xmlExpCtxtPtr;


Structure xmlExpNode

struct _xmlExpNode {
The content of this structure is not made public by the API.
} xmlExpNode;


Typedef xmlExpNodePtr

xmlExpNode * xmlExpNodePtr;



Structure xmlRegExecCtxt

struct _xmlRegExecCtxt {
The content of this structure is not made public by the API.
} xmlRegExecCtxt;


Typedef xmlRegExecCtxtPtr

xmlRegExecCtxt * xmlRegExecCtxtPtr;

A libxml progressive regular expression evaluation context


Structure xmlRegexp

struct _xmlRegexp {
The content of this structure is not made public by the API.
} xmlRegexp;


Typedef xmlRegexpPtr

xmlRegexp * xmlRegexpPtr;

A libxml regular expression; these can actually be far more complex than POSIX regular expressions.


Function type xmlRegExecCallbacks

void	xmlRegExecCallbacks		(xmlRegExecCtxtPtr exec, 
const xmlChar * token,
void * transdata,
void * inputdata)

Callback function when doing a transition in the automata

exec: the regular expression context
token: the current token string
transdata: transition data
inputdata: input data

Variable emptyExp

xmlExpNodePtr emptyExp;


Variable forbiddenExp

xmlExpNodePtr forbiddenExp;


xmlExpCtxtNbCons ()

int	xmlExpCtxtNbCons		(xmlExpCtxtPtr ctxt)

Debugging facility: provides the number of nodes allocated over the lifetime of the context

ctxt: an expression context
Returns: the number of nodes ever allocated or -1 in case of error

xmlExpCtxtNbNodes ()

int	xmlExpCtxtNbNodes		(xmlExpCtxtPtr ctxt)

Debugging facility: provides the number of nodes allocated at that point

ctxt: an expression context
Returns: the number of nodes in use or -1 in case of error

xmlExpDump ()

void	xmlExpDump			(xmlBufferPtr buf, 
xmlExpNodePtr expr)

Serialize the expression as compiled to the buffer

buf: a buffer to receive the output
expr: the compiled expression

xmlExpExpDerive ()

xmlExpNodePtr	xmlExpExpDerive		(xmlExpCtxtPtr ctxt, 
xmlExpNodePtr exp,
xmlExpNodePtr sub)

Evaluates the expression resulting from @exp consuming a sub expression @sub. Based on algebraic derivation and sometimes direct Brzozowski derivation, it usually takes less than linear time and can handle expressions generating infinite languages.

ctxt: the expressions context
exp: the englobing expression
sub: the subexpression
Returns: the resulting expression or NULL in case of internal error, the result must be freed

xmlExpFree ()

void	xmlExpFree			(xmlExpCtxtPtr ctxt, 
xmlExpNodePtr exp)

Dereference the expression

ctxt: the expression context
exp: the expression

xmlExpFreeCtxt ()

void	xmlExpFreeCtxt			(xmlExpCtxtPtr ctxt)

Free an expression context

ctxt: an expression context

xmlExpGetLanguage ()

int	xmlExpGetLanguage		(xmlExpCtxtPtr ctxt, 
xmlExpNodePtr exp,
const xmlChar ** langList,
int len)

Find all the strings used in @exp and store them in @langList

ctxt: the expression context
exp: the expression
langList: where to store the tokens
len: the allocated length of @langList
Returns: the number of unique strings found, -1 in case of errors and -2 if there are more than @len strings

xmlExpGetStart ()

int	xmlExpGetStart			(xmlExpCtxtPtr ctxt, 
xmlExpNodePtr exp,
const xmlChar ** tokList,
int len)

Find all the strings that appear at the start of the languages accepted by @exp and store them in @tokList. E.g. for (a, b) | c it will return the list [a, c]

ctxt: the expression context
exp: the expression
tokList: where to store the tokens
len: the allocated length of @tokList
Returns: the number of unique strings found, -1 in case of errors and -2 if there are more than @len strings

xmlExpIsNillable ()

int	xmlExpIsNillable		(xmlExpNodePtr exp)

Finds if the expression is nillable, i.e. if it accepts the empty sequence

exp: the expression
Returns: 1 if nillable, 0 if not and -1 in case of error

xmlExpMaxToken ()

int	xmlExpMaxToken			(xmlExpNodePtr expr)

Indicate the maximum number of inputs an expression can accept

expr: a compiled expression
Returns: the maximum length or -1 in case of error

xmlExpNewAtom ()

xmlExpNodePtr	xmlExpNewAtom		(xmlExpCtxtPtr ctxt, 
const xmlChar * name,
int len)

Get the atom associated to this name from that context

ctxt: the expression context
name: the atom name
len: the atom name length in bytes (or -1)
Returns: the node or NULL in case of error

xmlExpNewCtxt ()

xmlExpCtxtPtr	xmlExpNewCtxt		(int maxNodes, 
xmlDictPtr dict)

Creates a new context for manipulating expressions

maxNodes: the maximum number of nodes
dict: optional dictionary to use internally
Returns: the context or NULL in case of error

xmlExpNewOr ()

xmlExpNodePtr	xmlExpNewOr		(xmlExpCtxtPtr ctxt, 
xmlExpNodePtr left,
xmlExpNodePtr right)

Get the atom associated to the choice @left | @right. Note that @left and @right are consumed in the operation; to keep a handle on them use xmlExpRef() and use xmlExpFree() to release them. This is true even in case of failure (unless ctxt == NULL).

ctxt: the expression context
left: left expression
right: right expression
Returns: the node or NULL in case of error

xmlExpNewRange ()

xmlExpNodePtr	xmlExpNewRange		(xmlExpCtxtPtr ctxt, 
xmlExpNodePtr subset,
int min,
int max)

Get the atom associated to the range (@subset){@min, @max}. Note that @subset is consumed in the operation; to keep a handle on it use xmlExpRef() and use xmlExpFree() to release it. This is true even in case of failure (unless ctxt == NULL).

ctxt: the expression context
subset: the expression to be repeated
min: the lower bound for the repetition
max: the upper bound for the repetition, -1 means infinite
Returns: the node or NULL in case of error

xmlExpNewSeq ()

xmlExpNodePtr	xmlExpNewSeq		(xmlExpCtxtPtr ctxt, 
xmlExpNodePtr left,
xmlExpNodePtr right)

Get the atom associated to the sequence @left , @right. Note that @left and @right are consumed in the operation; to keep a handle on them use xmlExpRef() and use xmlExpFree() to release them. This is true even in case of failure (unless ctxt == NULL).

ctxt: the expression context
left: left expression
right: right expression
Returns: the node or NULL in case of error

xmlExpParse ()

xmlExpNodePtr	xmlExpParse		(xmlExpCtxtPtr ctxt, 
const char * expr)

Minimal parser for regexps; it understands the following constructs: - string terminals - choice operator | - sequence operator , - subexpressions (...) - usual cardinality operators + * and ? - finite sequences { min, max } - infinite sequences { min, * }. Only minimal checking is done; in particular there is no checking of string values.

ctxt: the expressions context
expr: the 0 terminated string
Returns: a new expression or NULL in case of failure

xmlExpRef ()

void	xmlExpRef			(xmlExpNodePtr exp)

Increase the reference count of the expression

exp: the expression

xmlExpStringDerive ()

xmlExpNodePtr	xmlExpStringDerive	(xmlExpCtxtPtr ctxt, 
xmlExpNodePtr exp,
const xmlChar * str,
int len)

Do one step of Brzozowski derivation of the expression @exp with respect to the input string

ctxt: the expression context
exp: the expression
str: the string
len: the string len in bytes if available
Returns: the resulting expression or NULL in case of internal error

xmlExpSubsume ()

int	xmlExpSubsume			(xmlExpCtxtPtr ctxt, 
xmlExpNodePtr exp,
xmlExpNodePtr sub)

Check whether @exp accepts all the languages accepted by @sub, the input being a subexpression.

ctxt: the expressions context
exp: the englobing expression
sub: the subexpression
Returns: 1 if true 0 if false and -1 in case of failure.

xmlRegExecErrInfo ()

int	xmlRegExecErrInfo		(xmlRegExecCtxtPtr exec, 
const xmlChar ** string,
int * nbval,
int * nbneg,
xmlChar ** values,
int * terminal)

Extract error information from the regexp execution. The parameter @string will be updated with the value pushed and not accepted. The parameter @values must point to an array of @nbval string pointers; on return, @nbval will contain the number of possible strings in that state and the @values array will be updated with them. The string values returned will be freed with the @exec context and don't need to be deallocated.

exec: a regexp execution context generating an error
string: return value for the error string
nbval: pointer to the number of accepted values IN/OUT
nbneg: return number of negative transitions
values: pointer to the array of acceptable values
terminal: return value if this was a terminal state
Returns: 0 in case of success or -1 in case of error.

xmlRegExecNextValues ()

int	xmlRegExecNextValues		(xmlRegExecCtxtPtr exec, 
int * nbval,
int * nbneg,
xmlChar ** values,
int * terminal)

Extract information from the regexp execution. The parameter @values must point to an array of @nbval string pointers; on return, @nbval will contain the number of possible strings in that state and the @values array will be updated with them. The string values returned will be freed with the @exec context and don't need to be deallocated.

exec: a regexp execution context
nbval: pointer to the number of accepted values IN/OUT
nbneg: return number of negative transitions
values: pointer to the array of acceptable values
terminal: return value if this was a terminal state
Returns: 0 in case of success or -1 in case of error.

xmlRegExecPushString ()

int	xmlRegExecPushString		(xmlRegExecCtxtPtr exec, 
const xmlChar * value,
void * data)

Push one input token in the execution context

exec: a regexp execution context or NULL to indicate the end
value: a string token input
data: data associated to the token to reuse in callbacks
Returns: 1 if the regexp reached a final state, 0 if non-final, and a negative value in case of error.

xmlRegExecPushString2 ()

int	xmlRegExecPushString2		(xmlRegExecCtxtPtr exec, 
const xmlChar * value,
const xmlChar * value2,
void * data)

Push one input token in the execution context

exec: a regexp execution context or NULL to indicate the end
value: the first string token input
value2: the second string token input
data: data associated to the token to reuse in callbacks
Returns: 1 if the regexp reached a final state, 0 if non-final, and a negative value in case of error.

xmlRegFreeExecCtxt ()

void	xmlRegFreeExecCtxt		(xmlRegExecCtxtPtr exec)

Free the structures associated to a regular expression evaluation context.

exec: a regular expression evaluation context

xmlRegFreeRegexp ()

void	xmlRegFreeRegexp		(xmlRegexpPtr regexp)

Free a regexp

regexp: the regexp

xmlRegNewExecCtxt ()

xmlRegExecCtxtPtr	xmlRegNewExecCtxt	(xmlRegexpPtr comp, 
xmlRegExecCallbacks callback,
void * data)

Build a context used for progressive evaluation of a regexp.

comp: a precompiled regular expression
callback: a callback function used for handling progresses in the automata matching phase
data: the context data associated to the callback in this context
Returns: the new context

xmlRegexpCompile ()

xmlRegexpPtr	xmlRegexpCompile	(const xmlChar * regexp)

Parses a regular expression conforming to XML Schemas Part 2 Datatype Appendix F and builds an automata suitable for testing strings against that regular expression

regexp: a regular expression string
Returns: the compiled expression or NULL in case of error

xmlRegexpExec ()

int	xmlRegexpExec			(xmlRegexpPtr comp, 
const xmlChar * content)

Check if the regular expression generates the value

comp: the compiled regular expression
content: the value to check against the regular expression
Returns: 1 if it matches, 0 if not and a negative value in case of error

xmlRegexpIsDeterminist ()

int	xmlRegexpIsDeterminist		(xmlRegexpPtr comp)

Check if the regular expression is determinist

comp: the compiled regular expression
Returns: 1 if it is, 0 if not and a negative value in case of error

xmlRegexpPrint ()

void	xmlRegexpPrint			(FILE * output, 
xmlRegexpPtr regexp)

Print the content of the compiled regular expression

output: the file for the output debug
regexp: the compiled regexp