FREE Subscription to Dr. Dobb’s Digest: Same Great Content, New Digital Edition
Site Archive (Complete)
Dobbs M-Dev
Email
Print
Reprint

add to:
Del.icio.us
Digg
Google
Furl
Slashdot
Y! MyWeb
Blink
October 01, 2003

URL Canonicalization Testing

(Page 8 of 16)
URL Canonicalization Testing

Listing 6 ConvertToUtf8


// from the header:
// const unsigned long utfPrimaryLeadByte_0;
// const unsigned long utfPrimaryLeadByte_1110;
// const unsigned long utfPrimaryLeadByte_11110;
// const unsigned long utfSecondaryLeadByte;
// unsigned long overlongUtfExtraBytesToAdd; <== constructor parameter

std::wstring CUtf8Encoder::ConvertToUtf8(wchar_t characterToEncode) const
    {
    typedef const unsigned long Byte;
    const std::wstring prefix(L"%");

    std::vector<std::wstring> encodings;

    const unsigned long codepoint(
        static_cast<const unsigned long>(characterToEncode));
    if (codepoint <= 0x7F)
        {
        Byte byteOne(utfPrimaryLeadByte_0 | codepoint);
        encodings.push_back
            (
            prefix + CConverters::NumberToStringAsBase(byteOne, 16, 2)
            );
        }
    if (codepoint <= 0x7FF)
        {
        Byte byteOne(utfSecondaryLeadByte | GetRightBits(codepoint, 1, 6));
        Byte byteTwo(utfPrimaryLeadByte_1110 | GetRightBits(codepoint, 7, 5));
        encodings.push_back
            (
              prefix + CConverters::NumberToStringAsBase(byteTwo, 16, 2)
            + prefix + CConverters::NumberToStringAsBase(byteOne, 16, 2)
            );
        }
    if (codepoint <= 0xFFFF)
        {
        Byte byteOne(utfSecondaryLeadByte | GetRightBits(codepoint, 1, 6));
        Byte byteTwo(utfSecondaryLeadByte | GetRightBits(codepoint, 7, 6));
        Byte byteThree(utfPrimaryLeadByte_11110 
            | GetRightBits(codepoint, 13, 4));
        encodings.push_back
            (
              prefix + CConverters::NumberToStringAsBase(byteThree, 16, 2)
            + prefix + CConverters::NumberToStringAsBase(byteTwo, 16, 2)
            + prefix + CConverters::NumberToStringAsBase(byteOne, 16, 2)
            );
        }
    // And so on and so forth through the rest of the character ranges.
    }

    if (overlongUtfExtraBytesToAdd < encodings.size())
        {
        return encodings[overlongUtfExtraBytesToAdd];
        }
    else
        {
        return encodings[encodings.size() - 1];
        }
    }
// Extracts numberOfBitsToGet consecutive bits from value and returns them
// right-aligned.  indexOfFirstBit is the 1-based position, counted from the
// right-hand (least-significant) end, of the lowest bit to extract.  For
// example, given the value "0110 1011 0001 1101",
// GetRightBits(value, 3, 6) returns "0000 0111".
//
// An indexOfFirstBit of 0 is not a valid position; 0 is returned for it
// rather than letting "indexOfFirstBit - 1" wrap around to a huge shift
// count.
unsigned long CUtf8Encoder::GetRightBits(unsigned long value
    , unsigned long indexOfFirstBit, unsigned long numberOfBitsToGet) const
    {
    if (indexOfFirstBit == 0)
        {
        return 0;
        }

    // Move the first wanted bit down to position 1, then mask off everything
    // above the requested number of bits.
    return RotateBitsRight(value, indexOfFirstBit - 1)
        & GetRightAlignedMaskBits(numberOfBitsToGet);
    }
// Generates a solid right-aligned bit mask: the lowest numberOfBitsToMask
// bits are set and every other bit is clear.  The mask is built from the
// 32-bit pattern 0xFFFFFFFF, so at most 32 bits can be set.
//
// A request for 0 bits yields 0 and a request for 32 or more bits yields
// 0xFFFFFFFF.  Both cases are handled explicitly because
// "32 - numberOfBitsToMask" would otherwise either shift by the full width
// (undefined where unsigned long is 32 bits wide) or wrap around to a huge
// shift count.
unsigned long CUtf8Encoder::GetRightAlignedMaskBits(
    unsigned long numberOfBitsToMask) const
    {
    const unsigned long allBitsSet = 0xFFFFFFFF;
    const unsigned long maskWidth = 32;

    if (numberOfBitsToMask == 0)
        {
        return 0;
        }
    if (numberOfBitsToMask >= maskWidth)
        {
        return allBitsSet;
        }
    return RotateBitsRight(allBitsSet, maskWidth - numberOfBitsToMask);
    }
// Shifts value right by numberOfBitsToRotate bit positions.  Despite the
// name this is a plain logical shift, not a rotation: bits moved past the
// right-hand end are dropped and zeros enter from the left.
//
// Shifting by the full width of the type or more would be undefined
// behaviour, so such requests return 0 (every bit has been shifted out).
unsigned long CUtf8Encoder::RotateBitsRight(unsigned long value
    , unsigned long numberOfBitsToRotate) const
    {
    // Width of unsigned long in bits; assumes 8-bit bytes.
    const unsigned long bitsInValue =
        static_cast<unsigned long>(sizeof(value)) * 8;

    if (numberOfBitsToRotate >= bitsInValue)
        {
        return 0;
        }
    return value >> numberOfBitsToRotate;
    }

Previous Page | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 Next Page
TOP 5 ARTICLES
No Top Articles.



MICROSITES
FEATURED TOPIC

ADDITIONAL TOPICS

INFO-LINK