Download: Reese.zip
The process of finding groups from one set of characters that are
separated by characters from another set is called tokenizing.
The groups of interest or "words" are known as tokens. The characters
separating them are called delimiters or separators.
C provides the function strtok, which accepts text and a
set of delimiters and tokenizes the text. The C++ string class does not
have an equivalent of strtok, and strtok itself works only on modifiable
C-style character arrays. However, this example demonstrates
a function (tokenize) that does
the same thing for strings and is easier to use.
string_tokenize.cpp
// string_tokenize.cpp
#include <algorithm>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>
using namespace std;
// Splits text into tokens: the runs of characters that are separated by
// any of the characters in delimiters. When delimiters is omitted (null),
// the definition below falls back to a single space as the delimiter.
vector<string> tokenize( const string& text, const char* delimiters=0 );
// Writes every token to standard output on its own line, prefixed with
// its 1-based position and a single space.
//
// tokens: the strings to display; an empty vector prints nothing.
//
// The vector is only read, so it is taken by const reference. That keeps
// existing callers working and also accepts const vectors and temporaries.
void print(const vector<string> &tokens)
{
    // Index with the container's own unsigned size type instead of int,
    // so size() is never narrowed and no signed/unsigned comparison occurs.
    for (vector<string>::size_type i = 0; i < tokens.size(); ++i) {
        // '\n' ends the line without forcing a flush after every token
        cout << i+1 << " " << tokens[i] << '\n';
    }
}
// Demonstrates tokenize on a sentence, on several file names, and on a
// Social Security number, printing the numbered tokens found in each.
int main( )
{
    // characters that separate the parts of a file name
    const char file_delimiters[] = ":\\.";

    // split a sentence into words, leaving the delimiters argument
    // at its default
    const string phrase( "How much wood would a woodchuck chuck?" );
    vector<string> tokens = tokenize( phrase );
    cout << "Tokens in " << phrase << endl;
    print( tokens );

    // split a full path into its parts
    const string full_path( "c:\\reese\\book\\code\\string_tokenize.cpp" );
    tokens = tokenize( full_path, file_delimiters );
    cout << "TOKENS IN " << full_path << endl;
    print( tokens );

    // a file name that contains no delimiters at all
    const string plain_name( "data" );
    tokens = tokenize( plain_name, file_delimiters );
    cout << "\nTOKENS IN " << plain_name << endl;
    print( tokens );

    // a file name that consists of delimiters only
    const string only_delimiters( ".." );
    tokens = tokenize( only_delimiters, file_delimiters );
    cout << "\nTOKENS IN " << only_delimiters << endl;
    print( tokens );

    // a different delimiter set: pull the numbers out of a
    // Social Security number
    const string social_security( "431-02-9495" );
    tokens = tokenize( social_security, "-" );
    cout << "\nTOKENS IN " << social_security << endl;
    print( tokens );

    return 0;
}
// Splits text into tokens, where a token is a maximal run of characters
// that are not in delimiters.
//
// text:       the string to split.
// delimiters: null-terminated set of separator characters; a null
//             pointer is treated as a single space.
//
// Returns the tokens in the order they appear. Runs of consecutive
// delimiters never produce empty tokens, and a text made up solely of
// delimiters (or an empty text) yields an empty vector.
vector<string> tokenize( const string& text, const char* delimiters)
{
    // the string search members cannot be given a null pointer, so
    // substitute a single space as the separator set
    const char* const separators = ( delimiters != 0 ) ? delimiters : " ";

    vector<string> result;

    // index of the first character of the current token, or npos once
    // nothing but separators (or nothing at all) is left
    string::size_type token_begin = text.find_first_not_of( separators, 0 );

    while( token_begin != string::npos )
    {
        // index of the separator that ends the current token, if any
        const string::size_type token_end =
            text.find_first_of( separators, token_begin );

        if( token_end == string::npos )
        {
            // no separator follows: the token runs to the end of the
            // text and is the last one
            result.push_back( text.substr( token_begin ) );
            break;
        }

        result.push_back( text.substr( token_begin, token_end - token_begin ) );

        // resume at the separator itself; the search skips over it and
        // any separators that follow
        token_begin = text.find_first_not_of( separators, token_end );
    }

    return result;
}
Tokens in How much wood would a woodchuck chuck?
1 How
2 much
3 wood
4 would
5 a
6 woodchuck
7 chuck?
TOKENS IN c:\reese\book\code\string_tokenize.cpp
1 c
2 reese
3 book
4 code
5 string_tokenize
6 cpp

TOKENS IN data
1 data

TOKENS IN ..

TOKENS IN 431-02-9495
1 431
2 02
3 9495
Maintained by John Loomis, updated Wed Feb 14 23:13:55 2007