是的:
如果您想将单个字符(如普通的连字符“-”)视为空白字符,我会使用 ctype facet。该 facet 决定区域设置(locale)如何对字符进行分类。在这种情况下,我们可以告诉 facet:'-' 也是一种空白字符。
#include <locale>
#include <fstream>
#include <iostream>
#include <string>
#include <sstream>
// DashSepFacet: a ctype<char> facet that classifies '-' as whitespace.
//
// Every other character keeps the classification it has in the locale that is
// passed to the constructor. A stream imbued with a locale containing this
// facet therefore splits words on '-' (in addition to ordinary whitespace)
// when reading with operator>>.
class DashSepFacet: public std::ctype<char>
{
public:
    typedef std::ctype<char> base;
    typedef base::char_type char_type;

    // Builds the classification table from the ctype<char> facet of `l` and
    // then adds the `space` bit to the entry for '-'.
    //
    // The base class is given the address of `table` before the table has
    // been filled in; the entries are written in the constructor body below.
    DashSepFacet(std::locale const& l) : base(table)
    {
        // Get the ctype facet of the supplied locale.
        std::ctype<char> const& defaultCType = std::use_facet<std::ctype<char> >(l);

        // Scratch buffer holding every char value once, in table order.
        // It is a plain local so separate constructions share no state.
        char chars[base::table_size];
        for (std::size_t i = 0; i < base::table_size; ++i) {
            chars[i] = static_cast<char>(i);
        }

        // Copy the default flags for each character into our own table.
        defaultCType.is(chars, chars + base::table_size, table);

        // Add '-' to the set of whitespace characters.
        table[static_cast<unsigned char>('-')] |= base::space;
    }

private:
    // Classification masks indexed by character value. ctype<char> requires
    // the table it is given to have at least table_size entries.
    base::mask table[base::table_size];
};
// Reads the file "X3" word by word and prints each word as "Word(<word>)".
// The stream's locale carries a DashSepFacet, so '-' separates words just
// like ordinary whitespace. Returns 1 if the file cannot be opened.
int main()
{
    // Create the stream, then imbue it with a locale built from the stream's
    // current locale plus our facet. The locale takes ownership of the facet.
    // The file is opened only after the locale is in place.
    // An input-only stream is used because the program never writes to it.
    std::ifstream data;
    data.imbue(std::locale(data.getloc(), new DashSepFacet(data.getloc())));
    data.open("X3");
    if (!data)
    {
        std::cerr << "Failed to open X3\n";
        return 1;
    }

    // Now you can use the stream like normal; the locale defines what is
    // whitespace, so operator>> also splits on '-'.
    std::string word;
    while (data >> word)
    {
        std::cout << "Word(" << word << ")\n";
    }
    return 0;
}
现在我们得到:
> ./a.out
Word(He)
Word(was)
Word(young—perhaps)
Word(from)
Word(twenty)
Word(eight)
Word(to)
Word(thirty—tall)
Word(slender)
不幸的是,em-dash(—)是一个 Unicode 码点,在 UTF-8 编码中实际上由 3 个字节(e2 80 94)表示,因此上述技术不起作用。相反,您可以使用 codecvt facet,它让区域设置(locale)在读写时转换字符序列(通常用于在不同编码格式之间转换)。在这种情况下,我们编写了一个将 em-dash 转换为普通空格字符的版本。
#include <locale>
#include <fstream>
#include <iostream>
#include <string>
#include <sstream>
#include <locale>
#include <string>
#include <iostream>
#include <fstream>
#include <cctype>
// PunctRemove: a codecvt facet that filters file input, replacing every
// UTF-8 em-dash (the three bytes e2 80 94) with a single ASCII space.
// All other bytes are passed through unchanged.
//
// The three bytes of an em-dash may be split across successive do_in() calls.
// The match progress is therefore kept in the state object the caller passes
// in, so it is tracked per conversion state rather than shared globally.
//
// Limitation: if input ends while a partial em-dash prefix is being held,
// those held bytes are never emitted.
class PunctRemove: public std::codecvt<char,char,std::char_traits<char>::state_type>
{
    typedef std::codecvt<char,char,std::char_traits<char>::state_type> MyType;
    typedef MyType::state_type state_type;
    typedef MyType::result result;

    // Conversion is required: input is not always copied through verbatim.
    bool do_always_noconv() const throw() { return false; }

    // 0 means variable width: an internal character may come from one
    // external byte (ordinary text) or from three (an em-dash).
    int do_encoding() const throw() { return 0; }

    // At most three external bytes produce one internal character.
    int do_max_length() const throw() { return 3; }

    /*
     * This function is used to filter the input.
     *
     * s                   conversion state; its first byte stores the match
     *                     progress between calls (zero means "nothing held").
     * from, from_end      input bytes to convert.
     * from_next           set to one past the last input byte consumed.
     * to, to_limit        output buffer.
     * to_next             set to one past the last output byte written.
     *
     * Returns ok when all input was consumed, or partial when the output
     * buffer filled up first (from_next then points at the first byte that
     * still has to be converted).
     */
    virtual result do_in(state_type& s,
        const char* from, const char* from_end, const char*& from_next,
        char* to, char* to_limit, char*& to_next) const
    {
        // UTF-8 encoding of U+2014 EM DASH.
        static const char kEmDash[3] = { '\xe2', '\x80', '\x94' };

        // Progress is packed into one byte of the state object:
        //   bits 0-1: number of em-dash bytes consumed but not yet emitted
        //   bits 2-3: how many of those must still be re-emitted verbatim
        //             because the sequence turned out not to be an em-dash
        unsigned char& saved = reinterpret_cast<unsigned char&>(s);
        unsigned matched = saved & 0x03u;
        unsigned replay = (saved >> 2) & 0x03u;
        if (matched > 2 || replay > matched) {
            // Not a value this function ever stores; start clean.
            matched = 0;
            replay = 0;
        }

        result outcome = ok;
        from_next = from;
        to_next = to;
        for (;;) {
            if (replay != 0) {
                // A partial match failed: write back the bytes we were
                // holding, one at a time, before looking at more input.
                if (to_next == to_limit) { outcome = partial; break; }
                *to_next++ = kEmDash[matched - replay];
                if (--replay == 0) {
                    matched = 0;
                }
                continue;
            }

            if (from_next == from_end) {
                break;
            }

            const char c = *from_next;
            if (c == kEmDash[matched]) {
                if (matched < 2) {
                    // Hold the byte until we know whether this is an em-dash.
                    ++matched;
                    ++from_next;
                    continue;
                }
                // Third byte matched: the whole em-dash becomes one space.
                if (to_next == to_limit) { outcome = partial; break; }
                *to_next++ = ' ';
                matched = 0;
                ++from_next;
                continue;
            }

            if (matched != 0) {
                // Mismatch after a partial match. The held bytes are replayed
                // from kEmDash rather than by rewinding from_next, because
                // they may have arrived in an earlier call. The current byte
                // is then examined again from a clean state.
                replay = matched;
                continue;
            }

            // Normal processing.
            if (to_next == to_limit) { outcome = partial; break; }
            *to_next++ = c;
            ++from_next;
        }

        saved = static_cast<unsigned char>(matched | (replay << 2));
        return outcome;
    }

    /*
     * This function is used to filter the output.
     *
     * No output filtering is implemented: noconv tells the caller to use the
     * source sequence as it is. Write a real conversion here if you need it.
     */
    virtual result do_out(state_type& state,
        const char* from, const char* from_end, const char*& from_next,
        char* to, char* to_limit, char*& to_next) const
    {
        from_next = from;
        to_next = to;
        return noconv;
    }
};
// Reads the file "X3" word by word and prints each word as "Word(<word>)".
// The stream's locale carries a PunctRemove facet, which filters the bytes
// read from the file. Returns 1 if the file cannot be opened.
int main()
{
    // Create the stream, then imbue it with a locale built from the stream's
    // current locale plus our facet. The locale takes ownership of the facet.
    // The file is opened only after the locale is in place.
    std::ifstream data;
    data.imbue(std::locale(data.getloc(), new PunctRemove()));
    data.open("X3");
    if (!data)
    {
        std::cerr << "Failed to open X3\n";
        return 1;
    }

    // Now you can use the stream like normal; the locale is replacing the
    // em-dash with a normal space.
    std::string word;
    while (data >> word)
    {
        std::cout << "Word(" << word << ")\n";
    }
    return 0;
}
现在我们得到:
> ./a.out
Word(He)
Word(was)
Word(young)
Word(perhaps)
Word(from)
Word(twenty-eight)
Word(to)
Word(thirty)
Word(tall)
Word(slender)