October 01, 2003
URL Canonicalization Testing
URL Canonicalization Testing
Listing 6: ConvertToUtf8
// from the header:
// const unsigned long utfPrimaryLeadByte_0;
// const unsigned long utfPrimaryLeadByte_1110;
// const unsigned long utfPrimaryLeadByte_11110;
// const unsigned long utfSecondaryLeadByte;
// unsigned long overlongUtfExtraBytesToAdd; <== constructor parameter
std::wstring CUtf8Encoder::ConvertToUtf8(wchar_t characterToEncode) const
{
typedef const unsigned long Byte;
const std::wstring prefix(L"%");
std::vector<std::wstring> encodings;
const unsigned long codepoint(
static_cast<const unsigned long>(characterToEncode));
if (codepoint <= 0x7F)
{
Byte byteOne(utfPrimaryLeadByte_0 | codepoint);
encodings.push_back
(
prefix + CConverters::NumberToStringAsBase(byteOne, 16, 2)
);
}
if (codepoint <= 0x7FF)
{
Byte byteOne(utfSecondaryLeadByte | GetRightBits(codepoint, 1, 6));
Byte byteTwo(utfPrimaryLeadByte_1110 | GetRightBits(codepoint, 7, 5));
encodings.push_back
(
prefix + CConverters::NumberToStringAsBase(byteTwo, 16, 2)
+ prefix + CConverters::NumberToStringAsBase(byteOne, 16, 2)
);
}
if (codepoint <= 0xFFFF)
{
Byte byteOne(utfSecondaryLeadByte | GetRightBits(codepoint, 1, 6));
Byte byteTwo(utfSecondaryLeadByte | GetRightBits(codepoint, 7, 6));
Byte byteThree(utfPrimaryLeadByte_11110
| GetRightBits(codepoint, 13, 4));
encodings.push_back
(
prefix + CConverters::NumberToStringAsBase(byteThree, 16, 2)
+ prefix + CConverters::NumberToStringAsBase(byteTwo, 16, 2)
+ prefix + CConverters::NumberToStringAsBase(byteOne, 16, 2)
);
}
// And so on and so forth through the rest of the character ranges.
}
if (overlongUtfExtraBytesToAdd < encodings.size())
{
return encodings[overlongUtfExtraBytesToAdd];
}
else
{
return encodings[encodings.size() - 1];
}
}
// Extracts numberOfBitsToGet bits from value and returns them right-aligned.
// indexOfFirstBit is 1-based, counted from the right-hand (least significant)
// end, and names the lowest bit of the extracted run.
//
// Example: for value "0110 1011 0001 1101", indexOfFirstBit 3 and
// numberOfBitsToGet 6, the result is "0000 0111".
unsigned long CUtf8Encoder::GetRightBits(unsigned long value
    , unsigned long indexOfFirstBit, unsigned long numberOfBitsToGet) const
{
    // Discard everything to the right of the first wanted bit...
    const unsigned long bitsToDiscard = indexOfFirstBit - 1;
    const unsigned long aligned = RotateBitsRight(value, bitsToDiscard);

    // ...then keep only the requested number of low-order bits.
    const unsigned long mask = GetRightAlignedMaskBits(numberOfBitsToGet);
    return aligned & mask;
}
// Generates a solid, right-aligned bit mask with numberOfBitsToMask low-order
// bits set. The mask is carved out of the 32-bit constant 0xFFFFFFFF, so it is
// never wider than 32 bits:
//   - 0 bits requested      -> 0
//   - 1..32 bits requested  -> that many low-order one-bits
//   - more than 32          -> treated as 32 (0xFFFFFFFF)
unsigned long CUtf8Encoder::GetRightAlignedMaskBits(
    unsigned long numberOfBitsToMask) const
{
    const unsigned long maxMaskBits = 32;

    // An empty mask would need a shift by the full 32 bits, which is
    // undefined when unsigned long is itself 32 bits wide.
    if (numberOfBitsToMask == 0)
    {
        return 0;
    }

    // Clamp so that (32 - n) cannot wrap around to a huge shift count.
    const unsigned long bitsInMask =
        (numberOfBitsToMask < maxMaskBits) ? numberOfBitsToMask : maxMaskBits;

    return RotateBitsRight(0xFFFFFFFF, maxMaskBits - bitsInMask);
}
// Shifts value right by numberOfBitsToRotate bits. Despite the name this is a
// logical shift, not a rotation: bits pushed off the right-hand end are
// dropped and zeros are shifted in from the left.
//
// Returns 0 when the shift count is at least the width of unsigned long,
// since every bit has then been shifted out.
unsigned long CUtf8Encoder::RotateBitsRight(unsigned long value
    , unsigned long numberOfBitsToRotate) const
{
    // Shifting by a count >= the width of the type is undefined behaviour in
    // C++, so handle it explicitly. Width is computed assuming 8-bit bytes
    // (CHAR_BIT == 8).
    const unsigned long bitWidth = sizeof(value) * 8;
    if (numberOfBitsToRotate >= bitWidth)
    {
        return 0;
    }
    return value >> numberOfBitsToRotate;
}
Previous Page |
1
|
2
|
3
|
4
|
5
|
6
|
7
|
8
|
9
|
10
|
11
|
12
|
13
|
14
|
15
|
16
Next Page