Download: Reese.zip
The process of finding groups of characters from one set that are
separated by characters from another set is called tokenizing.
The groups of interest or "words" are known as tokens. The characters
separating them are called delimiters or separators.
C provides the function strtok, which accepts text and any
set of delimiters and tokenizes the text. The C++ string class does not
have an equivalent of strtok. However, this example demonstrates
a function (tokenize) that does the same thing and is easier to use.
string_tokenize.cpp
// string_tokenize.cpp
//
// Demonstrates tokenize(), a strtok-like splitter that works on std::string:
// it returns the runs of non-delimiter characters found in a piece of text.

#include <algorithm>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

using namespace std;

// Splits `text` into tokens separated by any character in `delimiters`.
// Passing a null pointer (the default) selects a single space as delimiter.
vector<string> tokenize( const string& text, const char* delimiters = 0 );

// Writes each token on its own line to standard output, prefixed by its
// 1-based position and a space. The tokens are only read, never modified.
void print( const vector<string>& tokens )
{
    for( vector<string>::size_type i = 0; i < tokens.size(); ++i )
    {
        cout << i + 1 << " " << tokens[i] << '\n';
    }
}

// Runs tokenize() over several sample inputs and prints the results.
int main( )
{
    vector<string> tokens;
    const char file_delimiters[] = ":\\.";

    string phrase = "How much wood would a woodchuck chuck?";

    // find words (default delimiter: a space)
    tokens = tokenize( phrase );
    cout << "Tokens in " << phrase << '\n';
    print( tokens );

    string file( "c:\\reese\\book\\code\\string_tokenize.cpp" );

    // find the parts of the file name
    tokens = tokenize( file, file_delimiters );

    // display the tokens
    cout << "TOKENS IN " << file << '\n';
    print( tokens );

    // try a file with no delimiters
    file = "data";
    tokens = tokenize( file, file_delimiters );
    cout << "\nTOKENS IN " << file << '\n';
    print( tokens );

    // try a file that's all delimiters
    file = "..";
    tokens = tokenize( file, file_delimiters );
    cout << "\nTOKENS IN " << file << '\n';
    print( tokens );

    // try different delimiters by finding the numbers
    // in a Social Security number
    string social_security( "431-02-9495" );
    tokens = tokenize( social_security, "-" );
    cout << "\nTOKENS IN " << social_security << '\n';
    print( tokens );

    return 0;
}

// Splits `text` into tokens.
//
// text        the string to scan; it is not modified.
// delimiters  null-terminated set of separator characters; a null pointer
//             is treated as " ".
//
// Returns the tokens in the order they occur. Consecutive delimiters are
// treated as one separator, and leading or trailing delimiters are skipped,
// so no empty tokens are produced. Text that is empty or consists only of
// delimiters yields an empty vector.
vector<string> tokenize( const string& text, const char* delimiters )
{
    vector<string> tokens;

    // can't use NULL pointer in find_first_of
    if( delimiters == 0 )
        delimiters = " ";

    // beginning index of the current token: the next character that is
    // not a delimiter, or npos when only delimiters (or nothing) remain
    string::size_type start = text.find_first_not_of( delimiters, 0 );

    while( start != string::npos )
    {
        // ending index of the token: the next delimiter after `start`
        const string::size_type finish = text.find_first_of( delimiters, start );

        if( finish == string::npos )
        {
            // no delimiter follows, so the token runs to the end of the
            // string and there is no more text to look at
            tokens.push_back( text.substr( start ) );
            break;
        }

        // the token is all the characters from `start` to just before
        // the delimiter
        tokens.push_back( text.substr( start, finish - start ) );

        // resume at the delimiter itself (finish, not finish+1);
        // find_first_not_of skips it and any delimiters that follow
        start = text.find_first_not_of( delimiters, finish );
    }

    return tokens;
}
Tokens in How much wood would a woodchuck chuck?
1 How
2 much
3 wood
4 would
5 a
6 woodchuck
7 chuck?
TOKENS IN c:\reese\book\code\string_tokenize.cpp
1 c
2 reese
3 book
4 code
5 string_tokenize
6 cpp

TOKENS IN data
1 data

TOKENS IN ..

TOKENS IN 431-02-9495
1 431
2 02
3 9495
Maintained by John Loomis, updated Wed Feb 14 23:13:55 2007