Master commit of OpenFace.

This commit is contained in:
unknown
2016-04-28 15:40:36 -04:00
parent 5346d303ab
commit 57e58a6949
4406 changed files with 1441342 additions and 0 deletions

View File

@@ -0,0 +1,675 @@
// Copyright (C) 2005 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_CPP_TOKENIZER_KERNEl_1_
#define DLIB_CPP_TOKENIZER_KERNEl_1_
#include <string>
#include <iostream>
#include "cpp_tokenizer_kernel_abstract.h"
#include "../algs.h"
namespace dlib
{
namespace cpp_tok_kernel_1_helper
{
    // A (token text, token type) record.  cpp_tokenizer_kernel_1 queues these
    // up when it has to read ahead of the token it is about to return.
    struct token_text_pair
    {
        std::string token;  // the literal text of the token
        int type;           // one of the cpp_tokenizer_kernel_1 token type enum values
    };
}
template <
    typename tok,
    typename queue,
    typename set
    >
class cpp_tokenizer_kernel_1
{
    /*!
        REQUIREMENTS ON tok
            tok must be an implementation of tokenizer/tokenizer_kernel_abstract.h

        REQUIREMENTS ON queue
            queue must be an implementation of queue/queue_kernel_abstract.h
            and must have T==cpp_tok_kernel_1_helper::token_text_pair

        REQUIREMENTS ON set
            set must be an implementation of set/set_kernel_abstract.h or
            hash_set/hash_set_kernel_abstract.h and must have T==std::string.

        INITIAL VALUE
            - keywords == a set of all the C++ keywords
            - tokenizer.stream_is_set() == false
            - buffer.size() == 0
            - tokenizer.get_identifier_head() == "$_" + tokenizer.lowercase_letters() +
              tokenizer.uppercase_letters()
            - tokenizer.get_identifier_body() == "$_" + tokenizer.lowercase_letters() +
              tokenizer.uppercase_letters() + tokenizer.numbers()
            - have_peeked == false

        CONVENTION
            - tokenizer.stream_is_set() == stream_is_set()
            - tokenizer.get_stream() == get_stream()
            - keywords == a set of all the C++ keywords
            - tokenizer.get_identifier_head() == "$_" + tokenizer.lowercase_letters() +
              tokenizer.uppercase_letters()
            - tokenizer.get_identifier_body() == "$_" + tokenizer.lowercase_letters() +
              tokenizer.uppercase_letters() + tokenizer.numbers()
            - buffer == a queue of tokens.  This is where we put tokens
              we gathered early due to looking ahead.
            - if (have_peeked) then
                - next_token == the next token to be returned from get_token()
                - next_type == the type of the token in next_token
    !*/

    // shorthand for the helper record buffered tokens are stored in
    typedef cpp_tok_kernel_1_helper::token_text_pair token_text_pair;

public:

    // Token type values produced by get_token() and peek_type().
    enum
    {
        END_OF_FILE,
        KEYWORD,
        COMMENT,
        SINGLE_QUOTED_TEXT,
        DOUBLE_QUOTED_TEXT,
        IDENTIFIER,
        OTHER,
        NUMBER,
        WHITE_SPACE
    };

    cpp_tokenizer_kernel_1 (
    );

    virtual ~cpp_tokenizer_kernel_1 (
    );

    void clear(
    );

    void set_stream (
        std::istream& in
    );

    bool stream_is_set (
    ) const;

    std::istream& get_stream (
    ) const;

    void get_token (
        int& type,
        std::string& token
    );

    int peek_type (
    ) const;

    const std::string& peek_token (
    ) const;

    void swap (
        cpp_tokenizer_kernel_1<tok,queue,set>& item
    );

private:

    void buffer_token(
        int type,
        const std::string& token
    )
    /*!
        ensures
            - stores the token and its type into buffer
    !*/
    {
        token_text_pair temp;
        temp.token = token;
        temp.type = type;
        buffer.enqueue(temp);
    }

    void buffer_token(
        int type,
        char token
    )
    /*!
        ensures
            - stores the token and its type into buffer
    !*/
    {
        token_text_pair temp;
        temp.token = token;
        temp.type = type;
        buffer.enqueue(temp);
    }

    // restricted functions: this object is non-copyable
    cpp_tokenizer_kernel_1(const cpp_tokenizer_kernel_1<tok,queue,set>&);        // copy constructor
    cpp_tokenizer_kernel_1<tok,queue,set>& operator=(const cpp_tokenizer_kernel_1<tok,queue,set>&);    // assignment operator

    // data members
    set keywords;       // the C++ keywords (and '#'-prefixed preprocessor directives)
    queue buffer;       // tokens gathered early while looking ahead
    tok tokenizer;      // the low level tokenizer we translate into C++ tokens

    // one-token peek cache; mutable so the const peek functions can fill it
    mutable std::string next_token;
    mutable int next_type;
    mutable bool have_peeked;
};
template <
    typename tok,
    typename queue,
    typename set
    >
inline void swap (
    cpp_tokenizer_kernel_1<tok,queue,set>& a,
    cpp_tokenizer_kernel_1<tok,queue,set>& b
)
{
    // the global swap just defers to the member swap
    a.swap(b);
}
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// member function definitions
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
template <
    typename tok,
    typename queue,
    typename set
    >
cpp_tokenizer_kernel_1<tok,queue,set>::
cpp_tokenizer_kernel_1(
) :
    have_peeked(false)
{
    // Fill the keyword set with every C++ keyword (preprocessor directives
    // are stored with their leading '#').
    const char* keyword_table[] = {
        "#include", "__asm", "_asm", "if", "int", "else", "template",
        "void", "false", "class", "public", "while", "bool", "new",
        "delete", "true", "typedef", "const", "virtual", "inline",
        "for", "break", "struct", "float", "case", "enum", "this",
        "typeid", "double", "char", "typename", "signed", "friend",
        "wint_t", "default", "asm", "reinterpret_cast", "#define",
        "do", "continue", "auto", "unsigned", "size_t", "#undef",
        "#pragma", "namespace", "private", "#endif", "catch", "#else",
        "register", "volatile", "const_cast", "#end", "mutable",
        "static_cast", "wchar_t", "#if", "protected", "throw", "using",
        "dynamic_cast", "#ifdef", "return", "short", "#error", "#line",
        "explicit", "union", "#ifndef", "try", "sizeof", "goto",
        "long", "#elif", "static", "operator", "switch", "extern"
    };

    std::string word;
    for (unsigned long i = 0; i < sizeof(keyword_table)/sizeof(keyword_table[0]); ++i)
    {
        // set::add() consumes its argument, so hand it a fresh copy each time
        word = keyword_table[i];
        keywords.add(word);
    }

    // set the tokenizer's IDENTIFIER token for C++ identifiers
    // (note that '$' is allowed in identifiers here)
    tokenizer.set_identifier_token(
        "$_" + tokenizer.lowercase_letters() + tokenizer.uppercase_letters(),
        "$_" + tokenizer.lowercase_letters() + tokenizer.uppercase_letters() +
        tokenizer.numbers()
    );
}
// ----------------------------------------------------------------------------------------
template <
    typename tok,
    typename queue,
    typename set
    >
cpp_tokenizer_kernel_1<tok,queue,set>::
~cpp_tokenizer_kernel_1 (
)
{
    // nothing to do: all data members release their own resources
}
// ----------------------------------------------------------------------------------------
template <
    typename tok,
    typename queue,
    typename set
    >
void cpp_tokenizer_kernel_1<tok,queue,set>::
clear(
)
{
    // reset all state back to the initial value
    have_peeked = false;
    buffer.clear();
    tokenizer.clear();

    // tokenizer.clear() wiped the identifier charsets, so reinstall the
    // C++ identifier definition (head chars, then head chars plus digits)
    const std::string head = "$_" + tokenizer.lowercase_letters() +
                             tokenizer.uppercase_letters();
    const std::string body = head + tokenizer.numbers();
    tokenizer.set_identifier_token(head, body);
}
// ----------------------------------------------------------------------------------------
template <
    typename tok,
    typename queue,
    typename set
    >
void cpp_tokenizer_kernel_1<tok,queue,set>::
set_stream (
    std::istream& in
)
{
    // discard anything buffered or peeked from the previous stream
    buffer.clear();
    have_peeked = false;

    // attach the low level tokenizer to the new stream
    tokenizer.set_stream(in);
}
// ----------------------------------------------------------------------------------------
template <
    typename tok,
    typename queue,
    typename set
    >
bool cpp_tokenizer_kernel_1<tok,queue,set>::
stream_is_set (
) const
{
    // the underlying tokenizer owns the stream, so just forward the query
    return tokenizer.stream_is_set();
}
// ----------------------------------------------------------------------------------------
template <
    typename tok,
    typename queue,
    typename set
    >
std::istream& cpp_tokenizer_kernel_1<tok,queue,set>::
get_stream (
) const
{
    // the underlying tokenizer owns the stream, so just forward the query
    return tokenizer.get_stream();
}
// ----------------------------------------------------------------------------------------
template <
    typename tok,
    typename queue,
    typename set
    >
void cpp_tokenizer_kernel_1<tok,queue,set>::
get_token (
    int& type,
    std::string& token
)
/*
    Returns the next C++ token.  Raw tokens from the low level tokenizer are
    merged/classified here; when a C++ token spans several raw tokens the
    extra pieces are stored in buffer (via buffer_token()) and returned by
    later calls, so no input characters are ever lost or duplicated.
*/
{
    using namespace std;

    if (!have_peeked)
    {
        if (buffer.size() > 0)
        {
            // just return what is in the buffer
            token_text_pair temp;
            buffer.dequeue(temp);
            type = temp.type;
            token = temp.token;
            return;
        }

        tokenizer.get_token(type,token);

        // translate the raw token into a C++ token
        switch (type)
        {
        case tok::END_OF_FILE:
            {
                type = END_OF_FILE;
            } break;

        case tok::END_OF_LINE:
        case tok::WHITE_SPACE:
            {
                // collapse a run of consecutive newline/whitespace raw tokens
                // into one WHITE_SPACE token
                type = tokenizer.peek_type();
                if (type == tok::END_OF_LINE || type == tok::WHITE_SPACE)
                {
                    std::string temp;
                    do
                    {
                        tokenizer.get_token(type,temp);
                        token += temp;
                        type = tokenizer.peek_type();
                    }while (type == tok::END_OF_LINE || type == tok::WHITE_SPACE);
                }
                type = WHITE_SPACE;
            } break;

        case tok::NUMBER:
            {
                // this could be a hex number such as 0xa33. we should check for this.
                if (tokenizer.peek_type() == tok::IDENTIFIER && token == "0" &&
                    (tokenizer.peek_token()[0] == 'x' || tokenizer.peek_token()[0] == 'X'))
                {
                    // this is a hex number so accumulate all the numbers and identifiers that follow
                    // because they have to be part of the number
                    std::string temp;
                    tokenizer.get_token(type,temp);
                    token = "0" + temp;

                    // get the rest of the hex number
                    while (tokenizer.peek_type() == tok::IDENTIFIER ||
                           tokenizer.peek_type() == tok::NUMBER
                           )
                    {
                        tokenizer.get_token(type,temp);
                        token += temp;
                    }
                }
                // or this could be a floating point value or something with an 'e' or 'E' in it.
                else if ((tokenizer.peek_type() == tok::CHAR && tokenizer.peek_token()[0] == '.') ||
                         (tokenizer.peek_type() == tok::IDENTIFIER && std::tolower(tokenizer.peek_token()[0]) == 'e'))
                {
                    std::string temp;
                    tokenizer.get_token(type,temp);
                    token += temp;

                    // now get the rest of the floating point value
                    while (tokenizer.peek_type() == tok::IDENTIFIER ||
                           tokenizer.peek_type() == tok::NUMBER
                           )
                    {
                        tokenizer.get_token(type,temp);
                        token += temp;
                    }
                }
                type = NUMBER;
            } break;

        case tok::IDENTIFIER:
            {
                // identifiers that are in the keyword set are KEYWORD tokens
                if (keywords.is_member(token))
                {
                    type = KEYWORD;
                }
                else
                {
                    type = IDENTIFIER;
                }
            } break;

        case tok::CHAR:
            type = OTHER;
            switch (token[0])
            {
            case '#':
                {
                    // this might be a preprocessor keyword so we should check the
                    // next token
                    if (tokenizer.peek_type() == tok::IDENTIFIER &&
                        keywords.is_member('#'+tokenizer.peek_token()))
                    {
                        // merge '#' with the directive name (e.g. "#define")
                        tokenizer.get_token(type,token);
                        token = '#' + token;
                        type = KEYWORD;
                    }
                    else
                    {
                        // a lone '#' is just an OTHER token
                        token = '#';
                        type = OTHER;
                    }
                }
                break;

            case '"':
                {
                    // Gather everything up to the closing double quote into a
                    // DOUBLE_QUOTED_TEXT token.  The quote characters
                    // themselves are returned as separate OTHER tokens (the
                    // opening quote now, the string body and closing quote
                    // from the buffer on the following calls).
                    string temp;
                    tokenizer.get_token(type,token);
                    while (type != tok::END_OF_FILE)
                    {
                        // if this is the end of the quoted string, i.e. a '"'
                        // that isn't preceded by an escaping backslash.
                        // NOTE(review): this only looks back two characters,
                        // so a quote preceded by exactly three backslashes is
                        // treated as the terminator even though it is escaped
                        // -- confirm this edge case is acceptable.
                        if (type == tok::CHAR && token[0] == '"' &&
                            (temp.size() == 0 || temp[temp.size()-1] != '\\' ||
                             (temp.size() > 1 && temp[temp.size()-2] == '\\') ))
                        {
                            buffer_token(DOUBLE_QUOTED_TEXT,temp);
                            buffer_token(OTHER,"\"");
                            break;
                        }
                        else
                        {
                            temp += token;
                        }
                        tokenizer.get_token(type,token);
                    }
                    type = OTHER;
                    token = '"';
                } break;

            case '\'':
                {
                    // single quoted literal: buffer its contents (keeping an
                    // escaping backslash if present) as SINGLE_QUOTED_TEXT and
                    // return the opening quote now
                    string temp;
                    tokenizer.get_token(type,token);
                    if (type == tok::CHAR && token[0] == '\\')
                    {
                        temp += '\\';
                        tokenizer.get_token(type,token);
                    }
                    temp += token;
                    buffer_token(SINGLE_QUOTED_TEXT,temp);

                    // The next character should be a ' so take it out and put it in
                    // the buffer.
                    tokenizer.get_token(type,token);
                    buffer_token(OTHER,token);

                    type = OTHER;
                    token = '\'';
                } break;

            case '/':
                {
                    // look ahead to see if this is the start of a comment
                    if (tokenizer.peek_type() == tok::CHAR)
                    {
                        if (tokenizer.peek_token()[0] == '/')
                        {
                            tokenizer.get_token(type,token);
                            // this is the start of a line comment
                            token = "//";

                            string temp;
                            tokenizer.get_token(type,temp);
                            while (type != tok::END_OF_FILE)
                            {
                                // if this is the end of the comment (a newline
                                // not preceded by a '\' line continuation)
                                if (type == tok::END_OF_LINE &&
                                    token[token.size()-1] != '\\' )
                                {
                                    token += '\n';
                                    break;
                                }
                                else
                                {
                                    token += temp;
                                }
                                tokenizer.get_token(type,temp);
                            }
                            type = COMMENT;
                        }
                        else if (tokenizer.peek_token()[0] == '*')
                        {
                            tokenizer.get_token(type,token);
                            // this is the start of a block comment
                            token = "/*";

                            string temp;
                            tokenizer.get_token(type,temp);
                            while (type != tok::END_OF_FILE)
                            {
                                // if this is the end of the comment (a '/'
                                // right after an accumulated '*')
                                if (type == tok::CHAR && temp[0] == '/' &&
                                    token[token.size()-1] == '*')
                                {
                                    token += '/';
                                    break;
                                }
                                else
                                {
                                    token += temp;
                                }
                                tokenizer.get_token(type,temp);
                            }
                            type = COMMENT;
                        }
                    }
                } break;

            default:
                break;
            } // switch (token[0])
        } // switch (type)
    }
    else
    {
        // if we get this far it means we have peeked so we should
        // return the peek data.
        type = next_type;
        token = next_token;
        have_peeked = false;
    }
}
// ----------------------------------------------------------------------------------------
template <
    typename tok,
    typename queue,
    typename set
    >
int cpp_tokenizer_kernel_1<tok,queue,set>::
peek_type (
) const
{
    // get_token() is non-const, so cast constness away; it only modifies the
    // mutable peek cache members here.
    cpp_tokenizer_kernel_1<tok,queue,set>& self =
        const_cast<cpp_tokenizer_kernel_1<tok,queue,set>&>(*this);
    self.get_token(next_type,next_token);
    have_peeked = true;
    return next_type;
}
// ----------------------------------------------------------------------------------------
template <
    typename tok,
    typename queue,
    typename set
    >
const std::string& cpp_tokenizer_kernel_1<tok,queue,set>::
peek_token (
) const
{
    // get_token() is non-const, so cast constness away; it only modifies the
    // mutable peek cache members here.
    cpp_tokenizer_kernel_1<tok,queue,set>& self =
        const_cast<cpp_tokenizer_kernel_1<tok,queue,set>&>(*this);
    self.get_token(next_type,next_token);
    have_peeked = true;
    return next_token;
}
// ----------------------------------------------------------------------------------------
template <
    typename tok,
    typename queue,
    typename set
    >
void cpp_tokenizer_kernel_1<tok,queue,set>::
swap (
    cpp_tokenizer_kernel_1& item
)
/*
    Swaps the complete state of *this and item.

    BUG FIX: previously only tokenizer and buffer were exchanged.  The class
    CONVENTION ties next_token/next_type/have_peeked (the peek cache) and
    keywords to each object, so leaving them behind meant that if either
    object had peeked at a token, the cached token ended up attached to the
    wrong object after the swap.
*/
{
    tokenizer.swap(item.tokenizer);
    buffer.swap(item.buffer);
    keywords.swap(item.keywords);

    // exchange the peek cache as well (manual exchange: no extra includes)
    next_token.swap(item.next_token);

    int temp_type = next_type;
    next_type = item.next_type;
    item.next_type = temp_type;

    bool temp_peeked = have_peeked;
    have_peeked = item.have_peeked;
    item.have_peeked = temp_peeked;
}
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_CPP_TOKENIZER_KERNEl_1_

View File

@@ -0,0 +1,224 @@
// Copyright (C) 2005 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#undef DLIB_CPP_TOKENIZER_KERNEl_ABSTRACT_
#ifdef DLIB_CPP_TOKENIZER_KERNEl_ABSTRACT_
#include <string>
#include <iosfwd>
namespace dlib
{
class cpp_tokenizer
{
    /*!
        INITIAL VALUE
            stream_is_set() == false

        WHAT THIS OBJECT REPRESENTS
            This object represents a simple tokenizer for C++ source code.

        BUFFERING
            This object is allowed to buffer data from the input stream.
            Thus if you clear it or switch streams (via calling set_stream())
            any buffered data will be lost.

        TOKENS
            When picking out tokens the cpp_tokenizer will always extract the
            longest token it can.  For example, if faced with the string
            "AAA" it will consider the three As to be a single IDENTIFIER
            token not three smaller IDENTIFIER tokens.

            Also note that no characters in the input stream are discarded.
            They will all be returned in the text of some token.
            Additionally, each character will never be returned more than once.
            This means that if you concatenated all returned tokens it would exactly
            reproduce the contents of the input stream.

            The tokens are defined as follows:

            END_OF_FILE
                This token represents the end of file.  It doesn't have any
                actual characters associated with it.

            KEYWORD
                This token matches a C++ keyword.  (This includes the preprocessor
                directives).

            COMMENT
                This token matches a C++ comment.

            SINGLE_QUOTED_TEXT
                This token matches the text of any single quoted literal.
                For example, 'a' would be a match and the text of this token
                would be the single character a.

            DOUBLE_QUOTED_TEXT
                This token matches the text of any double quoted string.
                For example, "C++" would be a match and the text of this token
                would be the three character string C++.

            WHITE_SPACE
                This is a multi character token.  It is defined as a sequence of
                one or more spaces, carriage returns, newlines, and tabs.  I.e. It
                is composed of characters from the following string " \r\n\t".

            IDENTIFIER
                This token matches any C++ identifier that isn't matched by any
                of the above tokens.  (A C++ identifier being a string matching
                the regular expression [_$a-zA-Z][_$a-zA-Z0-9]*).

            NUMBER
                This token matches any C++ numerical constant.

            OTHER
                This matches anything that isn't part of one of the above tokens.
                It is always a single character.
    !*/

public:

    enum
    {
        END_OF_FILE,
        KEYWORD,
        COMMENT,
        SINGLE_QUOTED_TEXT,
        DOUBLE_QUOTED_TEXT,
        IDENTIFIER,
        OTHER,
        NUMBER,
        WHITE_SPACE
    };

    cpp_tokenizer (
    );
    /*!
        ensures
            - #*this is properly initialized
        throws
            - std::bad_alloc
    !*/

    virtual ~cpp_tokenizer (
    );
    /*!
        ensures
            - any resources associated with *this have been released
    !*/

    void clear(
    );
    /*!
        ensures
            - #*this has its initial value
        throws
            - std::bad_alloc
                If this exception is thrown then #*this is unusable
                until clear() is called and succeeds.
    !*/

    void set_stream (
        std::istream& in
    );
    /*!
        ensures
            - #*this will read data from in and tokenize it
            - #stream_is_set() == true
            - #get_stream() == in
    !*/

    bool stream_is_set (
    ) const;
    /*!
        ensures
            - returns true if a stream has been associated with *this by calling
              set_stream()
    !*/

    std::istream& get_stream (
    ) const;
    /*!
        requires
            - stream_is_set() == true
        ensures
            - returns a reference to the istream object that *this is reading
              from.
    !*/

    void get_token (
        int& type,
        std::string& token
    );
    /*!
        requires
            - stream_is_set() == true
        ensures
            - #token == the next token from the input stream get_stream()
            - #type == the type of the token in #token
        throws
            - bad_alloc
                If this exception is thrown then the call to this function will
                have no effect on *this but the values of #type and #token will be
                undefined.  Additionally, some characters may have been read
                from the stream get_stream() and lost.
    !*/

    int peek_type (
    ) const;
    /*!
        requires
            - stream_is_set() == true
        ensures
            - returns the type of the token that will be returned from
              the next call to get_token()
        throws
            - bad_alloc
                If this exception is thrown then the call to this function will
                have no effect on *this.  However, some characters may have been
                read from the stream get_stream() and lost.
    !*/

    const std::string& peek_token (
    ) const;
    /*!
        requires
            - stream_is_set() == true
        ensures
            - returns the text of the token that will be returned from
              the next call to get_token()
        throws
            - bad_alloc
                If this exception is thrown then the call to this function will
                have no effect on *this.  However, some characters may have been
                read from the stream get_stream() and lost.
    !*/

    void swap (
        cpp_tokenizer& item
    );
    /*!
        ensures
            - swaps *this and item
    !*/

private:

    // restricted functions: this object is non-copyable
    cpp_tokenizer(const cpp_tokenizer&);        // copy constructor
    cpp_tokenizer& operator=(const cpp_tokenizer&);    // assignment operator
};
inline void swap (
    cpp_tokenizer& a,
    cpp_tokenizer& b
)
{
    a.swap(b);
}
/*!
    provides a global swap function
!*/
}
#endif // DLIB_CPP_TOKENIZER_KERNEl_ABSTRACT_

View File

@@ -0,0 +1,137 @@
// Copyright (C) 2003 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_CPP_TOKENIZER_KERNEl_C_
#define DLIB_CPP_TOKENIZER_KERNEl_C_
#include "cpp_tokenizer_kernel_abstract.h"
#include "../assert.h"
#include <string>
#include <iostream>
namespace dlib
{
template <
    typename tokenizer
    >
class cpp_tokenizer_kernel_c : public tokenizer
{
    /*!
        This is a "checking" wrapper around a cpp_tokenizer implementation.
        The overridden functions below verify (via DLIB_CASSERT) that a
        stream has been set before forwarding to the real implementation.
    !*/
public:

    std::istream& get_stream (
    ) const;

    void get_token (
        int& type,
        std::string& token
    );

    int peek_type (
    ) const;

    const std::string& peek_token (
    ) const;
};
template <
    typename tokenizer
    >
inline void swap (
    cpp_tokenizer_kernel_c<tokenizer>& a,
    cpp_tokenizer_kernel_c<tokenizer>& b
)
{
    // the global swap just defers to the member swap
    a.swap(b);
}
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// member function definitions
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
template <
    typename tokenizer
    >
std::istream& cpp_tokenizer_kernel_c<tokenizer>::
get_stream (
) const
{
    // make sure requires clause is not broken: get_stream() may only be
    // called after a stream has been attached via set_stream()
    DLIB_CASSERT( this->stream_is_set() == true,
        "\tstd::istream& cpp_tokenizer::get_stream()"
        << "\n\tyou must set a stream for this object before you can get it"
        << "\n\tthis: " << this
        );

    // call the real function
    return tokenizer::get_stream();
}
// ----------------------------------------------------------------------------------------
template <
    typename tokenizer
    >
const std::string& cpp_tokenizer_kernel_c<tokenizer>::
peek_token (
) const
{
    // make sure requires clause is not broken: peeking requires that a
    // stream has been attached via set_stream()
    DLIB_CASSERT( this->stream_is_set() == true,
        "\tconst std::string& cpp_tokenizer::peek_token()"
        << "\n\tyou must set a stream for this object before you can peek at what it contains"
        << "\n\tthis: " << this
        );

    // call the real function
    return tokenizer::peek_token();
}
// ----------------------------------------------------------------------------------------
template <
    typename tokenizer
    >
int cpp_tokenizer_kernel_c<tokenizer>::
peek_type (
) const
{
    // make sure requires clause is not broken: peeking requires that a
    // stream has been attached via set_stream()
    DLIB_CASSERT( this->stream_is_set() == true,
        "\tint cpp_tokenizer::peek_type()"
        << "\n\tyou must set a stream for this object before you can peek at what it contains"
        << "\n\tthis: " << this
        );

    // call the real function
    return tokenizer::peek_type();
}
// ----------------------------------------------------------------------------------------
template <
    typename tokenizer
    >
void cpp_tokenizer_kernel_c<tokenizer>::
get_token (
    int& type,
    std::string& token
)
{
    // make sure requires clause is not broken: tokens can only be read once
    // a stream has been attached via set_stream()
    DLIB_CASSERT( this->stream_is_set() == true,
        "\tvoid cpp_tokenizer::get_token()"
        << "\n\tyou must set a stream for this object before you can get tokens from it."
        << "\n\tthis: " << this
        );

    // call the real function
    tokenizer::get_token(type,token);
}
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_CPP_TOKENIZER_KERNEl_C_