The Labs \ Source Viewer \ SSCLI \ System.Globalization \ CharUnicodeInfo

  1. // ==++==
  2. //
  3. //
  4. // Copyright (c) 2006 Microsoft Corporation. All rights reserved.
  5. //
  6. // The use and distribution terms for this software are contained in the file
  7. // named license.txt, which can be found in the root of this distribution.
  8. // By using this software in any fashion, you are agreeing to be bound by the
  9. // terms of this license.
  10. //
  11. // You must not remove this notice, or any other, from this software.
  12. //
  13. //
  14. // ==--==
  15. ////////////////////////////////////////////////////////////////////////////
  16. //
  17. // Class: CharUnicodeInfo
  18. //
  19. // Purpose: This class implements a set of methods for retrieving
  20. // character type information. Character type information is
  21. // independent of culture and region.
  22. //
  23. // Date: August 12, 1998
  24. //
  25. ////////////////////////////////////////////////////////////////////////////
  26. namespace System.Globalization
  27. {
  28.    
  29.     //This class has only static members and therefore doesn't need to be serialized.
  30.    
  31.     using System;
  32.     using System.Threading;
  33.     using System.Runtime.InteropServices;
  34.     using System.Runtime.CompilerServices;
  35.     using System.Reflection;
  36.    
  37.     // Only statics, does not need to be marked with the serializable attribute
  38.    
  39.    
  40.     public sealed class CharUnicodeInfo : object
  41.     {
  42.         //--------------------------------------------------------------------//
  43.         // Internal Information //
  44.         //--------------------------------------------------------------------//
  45.        
  46.         //
  47.         // Native methods to access the Unicode category data tables in charinfo.nlp.
  48.         //
  49.         internal const char HIGH_SURROGATE_START = '�';
  50.         internal const char HIGH_SURROGATE_END = '�';
  51.         internal const char LOW_SURROGATE_START = '�';
  52.         internal const char LOW_SURROGATE_END = '�';
  53.        
  54.         internal const int UNICODE_CATEGORY_OFFSET = 0;
  55.         internal const int BIDI_CATEGORY_OFFSET = 1;
  56.        
  57.         // The base pointer of the data table
  58.         unsafe static byte* m_pDataTable;
  59.        
  60.         // The native pointer to the 12:4:4 index table of the Unicode cateogry data.
  61.         unsafe static ushort* m_pCategoryLevel1Index;
  62.         unsafe static byte* m_pCategoriesValue;
  63.        
  64.         // The native pointer to the 12:4:4 index table of the Unicode numeric data.
  65.         // The value of this index table is an index into the real value table stored in m_pNumericValues.
  66.         unsafe static ushort* m_pNumericLevel1Index;
  67.        
  68.         // The numeric value table, which is indexed by m_pNumericLevel1Index.
  69.         // Every item contains the value for numeric value.
  70.         // unsafe static double* m_pNumericValues;
  71.         // To get around the IA64 alignment issue. Our double data is aligned in 8-byte boundary, but loader loads the embeded table starting
  72.         // at 4-byte boundary. This cause a alignment issue since double is 8-byte.
  73.         unsafe static byte* m_pNumericValues;
  74.        
  75.         // The digit value table, which is indexed by m_pNumericLevel1Index. It shares the same indice as m_pNumericValues.
  76.         // Every item contains the value for decimal digit/digit value.
  77.         unsafe static DigitValues* m_pDigitValues;
  78.        
  79.         internal const string UNICODE_INFO_FILE_NAME = "charinfo.nlp";
  80.         // The starting codepoint for Unicode plane 1. Plane 1 contains 0x010000 ~ 0x01ffff.
  81.         internal const int UNICODE_PLANE01_START = 65536;
  82.        
  83.        
  84.         //
  85.         // This is the header for the native data table that we load from UNICODE_INFO_FILE_NAME.
  86.         //
  87.         // Excplicit layout is used here since a syntax like char[16] can not be used in sequential layout.
  88.         [StructLayout(LayoutKind.Explicit)]
  89.         unsafe internal struct UnicodeDataHeader
  90.         {
  91.             [FieldOffset(0)]
  92.             internal char TableName;
  93.             // WCHAR[16]
  94.             [FieldOffset(32)]
  95.             internal ushort version;
  96.             // WORD[4]
  97.             [FieldOffset(40)]
  98.             internal uint OffsetToCategoriesIndex;
  99.             // DWORD
  100.             [FieldOffset(44)]
  101.             internal uint OffsetToCategoriesValue;
  102.             // DWORD
  103.             [FieldOffset(48)]
  104.             internal uint OffsetToNumbericIndex;
  105.             // DWORD
  106.             [FieldOffset(52)]
  107.             internal uint OffsetToDigitValue;
  108.             // DWORD
  109.             [FieldOffset(56)]
  110.             internal uint OffsetToNumbericValue;
  111.             // DWORD
  112.         }
  113.        
  114.         // NOTE: It's important to specify pack size here, since the size of the structure is 2 bytes. Otherwise,
  115.         // the default pack size will be 4.
  116.        
  117.         [StructLayout(LayoutKind.Sequential, Pack = 2)]
  118.         internal struct DigitValues
  119.         {
  120.             internal sbyte decimalDigit;
  121.             internal sbyte digit;
  122.         }
  123.        
  124.        
  125.         //We need to allocate the underlying table that provides us with the information that we
  126.         //use. We allocate this once in the class initializer and then we don't need to worry
  127.         //about it again.
  128.         //
  129.         unsafe static CharUnicodeInfo()
  130.         {
  131.             m_pDataTable = GlobalizationAssembly.GetGlobalizationResourceBytePtr(typeof(CharUnicodeInfo).Assembly, UNICODE_INFO_FILE_NAME);
  132.             UnicodeDataHeader* mainHeader = (UnicodeDataHeader*)m_pDataTable;
  133.            
  134.             // Set up the native pointer to different part of the tables.
  135.             m_pCategoryLevel1Index = (ushort*)(m_pDataTable + mainHeader->OffsetToCategoriesIndex);
  136.             m_pCategoriesValue = (byte*)(m_pDataTable + mainHeader->OffsetToCategoriesValue);
  137.             m_pNumericLevel1Index = (ushort*)(m_pDataTable + mainHeader->OffsetToNumbericIndex);
  138.             m_pNumericValues = (byte*)(m_pDataTable + mainHeader->OffsetToNumbericValue);
  139.             m_pDigitValues = (DigitValues*)(m_pDataTable + mainHeader->OffsetToDigitValue);
  140.            
  141.             // Go to native side to make sure the native CharacterInfoTable pointer in the native side is initialized.
  142.             nativeInitTable(m_pDataTable);
  143.         }
  144.        
  145.        
  146.         ////////////////////////////////////////////////////////////////////////
  147.         //
  148.         // Define a private ctor so that compiler won't generate a default public ctor for us.
  149.         //
  150.         ////////////////////////////////////////////////////////////////////////
  151.         private CharUnicodeInfo()
  152.         {
  153.         }
  154.        
  155.        
  156.         ////////////////////////////////////////////////////////////////////////
  157.         //
  158.         // Actions:
  159.         // Convert the BMP character or surrogate pointed by index to a UTF32 value.
  160.         // This is similar to Char.ConvertToUTF32, but the difference is that
  161.         // it does not throw exceptions when invalid surrogate characters are passed in.
  162.         //
  163.         // WARNING: since it doesn't throw an exception it CAN return a value
  164.         // in the surrogate range D800-DFFF, which are not legal unicode values.
  165.         //
  166.         ////////////////////////////////////////////////////////////////////////
  167.        
  168.         static internal int InternalConvertToUtf32(string s, int index)
  169.         {
  170.             BCLDebug.Assert(s != null, "s != null");
  171.             BCLDebug.Assert(index >= 0 && index < s.Length, "index < s.Length");
  172.             if (index < s.Length - 1) {
  173.                 int temp1 = (int)s[index] - HIGH_SURROGATE_START;
  174.                 if (temp1 >= 0 && temp1 <= 1023) {
  175.                     int temp2 = (int)s[index + 1] - LOW_SURROGATE_START;
  176.                     if (temp2 >= 0 && temp2 <= 1023) {
  177.                         // Convert the surrogate to UTF32 and get the result.
  178.                         return ((temp1 * 1024) + temp2 + UNICODE_PLANE01_START);
  179.                     }
  180.                 }
  181.             }
  182.             return ((int)s[index]);
  183.         }
  184.        
  185.         ////////////////////////////////////////////////////////////////////////
  186.         //
  187.         // Convert a character or a surrogate pair starting at index of string s
  188.         // to UTF32 value.
  189.         //
  190.         // Parameters:
  191.         // s The string
  192.         // index The starting index. It can point to a BMP character or
  193.         // a surrogate pair.
  194.         // len The length of the string.
  195.         // charLength [out] If the index points to a BMP char, charLength
  196.         // will be 1. If the index points to a surrogate pair,
  197.         // charLength will be 2.
  198.         //
  199.         // WARNING: since it doesn't throw an exception it CAN return a value
  200.         // in the surrogate range D800-DFFF, which are not legal unicode values.
  201.         //
  202.         // Returns:
  203.         // The UTF32 value
  204.         //
  205.         ////////////////////////////////////////////////////////////////////////
  206.        
  207.         static internal int InternalConvertToUtf32(string s, int index, out int charLength)
  208.         {
  209.             BCLDebug.Assert(s != null, "s != null");
  210.             BCLDebug.Assert(s.Length > 0, "s.Length > 0");
  211.             BCLDebug.Assert(index >= 0 && index < s.Length, "index >= 0 && index < s.Length");
  212.             charLength = 1;
  213.             if (index < s.Length - 1) {
  214.                 int temp1 = (int)s[index] - HIGH_SURROGATE_START;
  215.                 if (temp1 >= 0 && temp1 <= 1023) {
  216.                     int temp2 = (int)s[index + 1] - LOW_SURROGATE_START;
  217.                     if (temp2 >= 0 && temp2 <= 1023) {
  218.                         // Convert the surrogate to UTF32 and get the result.
  219.                         charLength++;
  220.                         return ((temp1 * 1024) + temp2 + UNICODE_PLANE01_START);
  221.                     }
  222.                 }
  223.             }
  224.             return ((int)s[index]);
  225.         }
  226.        
  227.         ////////////////////////////////////////////////////////////////////////
  228.         //
  229.         // IsWhiteSpace
  230.         //
  231.         // Determines if the given character is a white space character.
  232.         //
  233.         ////////////////////////////////////////////////////////////////////////
  234.        
  235.         static internal bool IsWhiteSpace(string s, int index)
  236.         {
  237.             BCLDebug.Assert(s != null, "s!=null");
  238.             BCLDebug.Assert(index >= 0 && index < s.Length, "index >= 0 && index < s.Length");
  239.            
  240.             UnicodeCategory uc = GetUnicodeCategory(s, index);
  241.             // In Unicode 3.0, U+2028 is the only character which is under the category "LineSeparator".
  242.             // And U+2029 is th eonly character which is under the category "ParagraphSeparator".
  243.             switch (uc) {
  244.                 case (UnicodeCategory.SpaceSeparator):
  245.                 case (UnicodeCategory.LineSeparator):
  246.                 case (UnicodeCategory.ParagraphSeparator):
  247.                     return (true);
  248.             }
  249.             return (false);
  250.         }
  251.        
  252.        
  253.         static internal bool IsWhiteSpace(char c)
  254.         {
  255.             UnicodeCategory uc = GetUnicodeCategory(c);
  256.             // In Unicode 3.0, U+2028 is the only character which is under the category "LineSeparator".
  257.             // And U+2029 is th eonly character which is under the category "ParagraphSeparator".
  258.             switch (uc) {
  259.                 case (UnicodeCategory.SpaceSeparator):
  260.                 case (UnicodeCategory.LineSeparator):
  261.                 case (UnicodeCategory.ParagraphSeparator):
  262.                     return (true);
  263.             }
  264.            
  265.             return (false);
  266.         }
  267.        
  268.         //
  269.         // This is called by the public char and string, index versions
  270.         //
  271.         // Note that for ch in the range D800-DFFF we just treat it as any other non-numeric character
  272.         //
  273.         unsafe static internal double InternalGetNumericValue(int ch)
  274.         {
  275.             BCLDebug.Assert(ch >= 0 && ch <= 1114111, "ch is not in valid Unicode range.");
  276.             // Get the level 2 item from the highest 12 bit (8 - 19) of ch.
  277.             ushort index = m_pNumericLevel1Index[ch >> 8];
  278.             // Get the level 2 WORD offset from the 4 - 7 bit of ch. This provides the base offset of the level 3 table.
  279.             // The offset is referred to an float item in m_pNumericFloatData.
  280.             // Note that & has the lower precedence than addition, so don't forget the parathesis.
  281.             index = m_pNumericLevel1Index[index + ((ch >> 4) & 15)];
  282.             byte* pBytePtr = (byte*)&(m_pNumericLevel1Index[index]);
  283.             // Get the result from the 0 -3 bit of ch.
  284.             return (((double*)m_pNumericValues)[pBytePtr[(ch & 15)]]);
  285.         }
  286.        
  287.         //
  288.         // This is called by the public char and string, index versions
  289.         //
  290.         // Note that for ch in the range D800-DFFF we just treat it as any other non-numeric character
  291.         //
  292.         unsafe static internal DigitValues* InternalGetDigitValues(int ch)
  293.         {
  294.             BCLDebug.Assert(ch >= 0 && ch <= 1114111, "ch is not in valid Unicode range.");
  295.             // Get the level 2 item from the highest 12 bit (8 - 19) of ch.
  296.             ushort index = m_pNumericLevel1Index[ch >> 8];
  297.             // Get the level 2 WORD offset from the 4 - 7 bit of ch. This provides the base offset of the level 3 table.
  298.             // The offset is referred to an float item in m_pNumericFloatData.
  299.             // Note that & has the lower precedence than addition, so don't forget the parathesis.
  300.             index = m_pNumericLevel1Index[index + ((ch >> 4) & 15)];
  301.             byte* pBytePtr = (byte*)&(m_pNumericLevel1Index[index]);
  302.             // Get the result from the 0 -3 bit of ch.
  303.             return &(m_pDigitValues[pBytePtr[(ch & 15)]]);
  304.         }
  305.        
  306.        
  307.         unsafe static internal sbyte InternalGetDecimalDigitValue(int ch)
  308.         {
  309.             return (InternalGetDigitValues(ch)->decimalDigit);
  310.         }
  311.        
  312.         unsafe static internal sbyte InternalGetDigitValue(int ch)
  313.         {
  314.             return (InternalGetDigitValues(ch)->digit);
  315.         }
  316.        
  317.        
  318.         ////////////////////////////////////////////////////////////////////////
  319.         //
  320.         //Returns the numeric value associated with the character c. If the character is a fraction,
  321.         // the return value will not be an integer. If the character does not have a numeric value, the return value is -1.
  322.         //
  323.         //Returns:
  324.         // the numeric value for the specified Unicode character. If the character does not have a numeric value, the return value is -1.
  325.         //Arguments:
  326.         // ch a Unicode character
  327.         //Exceptions:
  328.         // ArgumentNullException
  329.         // ArgumentOutOfRangeException
  330.         //
  331.         ////////////////////////////////////////////////////////////////////////
  332.        
  333.        
  334.         public static double GetNumericValue(char ch)
  335.         {
  336.             return (InternalGetNumericValue(ch));
  337.         }
  338.        
  339.        
  340.         public static double GetNumericValue(string s, int index)
  341.         {
  342.             if (s == null) {
  343.                 throw new ArgumentNullException("s");
  344.             }
  345.             if (index < 0 || index >= s.Length) {
  346.                 throw new ArgumentOutOfRangeException("index", Environment.GetResourceString("ArgumentOutOfRange_Index"));
  347.             }
  348.             return (InternalGetNumericValue(InternalConvertToUtf32(s, index)));
  349.            
  350.         }
  351.        
  352.         ////////////////////////////////////////////////////////////////////////
  353.         //
  354.         //Returns the decimal digit value associated with the character c.
  355.         //
  356.         // The value should be from 0 ~ 9.
  357.         // If the character does not have a numeric value, the return value is -1.
  358.         // From Unicode.org: Decimal Digits. Digits that can be used to form decimal-radix numbers.
  359.         //Returns:
  360.         // the decimal digit value for the specified Unicode character. If the character does not have a decimal digit value, the return value is -1.
  361.         //Arguments:
  362.         // ch a Unicode character
  363.         //Exceptions:
  364.         // ArgumentNullException
  365.         // ArgumentOutOfRangeException
  366.         //
  367.         ////////////////////////////////////////////////////////////////////////
  368.        
  369.        
  370.         public static int GetDecimalDigitValue(char ch)
  371.         {
  372.             return (InternalGetDecimalDigitValue(ch));
  373.         }
  374.        
  375.        
  376.         public static int GetDecimalDigitValue(string s, int index)
  377.         {
  378.             if (s == null) {
  379.                 throw new ArgumentNullException("s");
  380.             }
  381.             if (index < 0 || index >= s.Length) {
  382.                 throw new ArgumentOutOfRangeException("index", Environment.GetResourceString("ArgumentOutOfRange_Index"));
  383.             }
  384.            
  385.             return (InternalGetDecimalDigitValue(InternalConvertToUtf32(s, index)));
  386.         }
  387.        
  388.         ////////////////////////////////////////////////////////////////////////
  389.         //
  390.         //Action: Returns the digit value associated with the character c.
  391.         // If the character does not have a numeric value, the return value is -1.
  392.         // From Unicode.org: If the character represents a digit, not necessarily a decimal digit,
  393.         // the value is here. This covers digits which do not form decimal radix forms, such as the compatibility superscript digits.
  394.         //
  395.         // An example is: U+2460 IRCLED DIGIT ONE. This character has digit value 1, but does not have associcated decimal digit value.
  396.         //
  397.         //Returns:
  398.         // the digit value for the specified Unicode character. If the character does not have a digit value, the return value is -1.
  399.         //Arguments:
  400.         // ch a Unicode character
  401.         //Exceptions:
  402.         // ArgumentNullException
  403.         // ArgumentOutOfRangeException
  404.         //
  405.         ////////////////////////////////////////////////////////////////////////
  406.        
  407.        
  408.         public static int GetDigitValue(char ch)
  409.         {
  410.             return (InternalGetDigitValue(ch));
  411.         }
  412.        
  413.        
  414.         public static int GetDigitValue(string s, int index)
  415.         {
  416.             if (s == null) {
  417.                 throw new ArgumentNullException("s");
  418.             }
  419.             if (index < 0 || index >= s.Length) {
  420.                 throw new ArgumentOutOfRangeException("index", Environment.GetResourceString("ArgumentOutOfRange_Index"));
  421.             }
  422.             return (InternalGetDigitValue(InternalConvertToUtf32(s, index)));
  423.         }
  424.        
  425.         public static UnicodeCategory GetUnicodeCategory(char ch)
  426.         {
  427.             return (InternalGetUnicodeCategory(ch));
  428.         }
  429.        
  430.         public static UnicodeCategory GetUnicodeCategory(string s, int index)
  431.         {
  432.             if (s == null)
  433.                 throw new ArgumentNullException("s");
  434.             if (((uint)index) >= ((uint)s.Length)) {
  435.                 throw new ArgumentOutOfRangeException("index");
  436.             }
  437.             return InternalGetUnicodeCategory(s, index);
  438.         }
  439.        
  440.         unsafe static internal UnicodeCategory InternalGetUnicodeCategory(int ch)
  441.         {
  442.             return ((UnicodeCategory)InternalGetCategoryValue(ch, UNICODE_CATEGORY_OFFSET));
  443.         }
  444.        
  445.         ////////////////////////////////////////////////////////////////////////
  446.         //
  447.         //Action: Returns the Unicode Category property for the character c.
  448.         //Returns:
  449.         // an value in UnicodeCategory enum
  450.         //Arguments:
  451.         // ch a Unicode character
  452.         //Exceptions:
  453.         // None
  454.         //
  455.         //Note that this API will return values for D800-DF00 surrogate halves.
  456.         //
  457.         ////////////////////////////////////////////////////////////////////////
  458.        
  459.         unsafe static internal byte InternalGetCategoryValue(int ch, int offset)
  460.         {
  461.             BCLDebug.Assert(ch >= 0 && ch <= 1114111, "ch is not in valid Unicode range.");
  462.             // Get the level 2 item from the highest 12 bit (8 - 19) of ch.
  463.             ushort index = m_pCategoryLevel1Index[ch >> 8];
  464.             // Get the level 2 WORD offset from the 4 - 7 bit of ch. This provides the base offset of the level 3 table.
  465.             // Note that & has the lower precedence than addition, so don't forget the parathesis.
  466.             index = m_pCategoryLevel1Index[index + ((ch >> 4) & 15)];
  467.             byte* pBytePtr = (byte*)&(m_pCategoryLevel1Index[index]);
  468.             // Get the result from the 0 -3 bit of ch.
  469.             byte valueIndex = pBytePtr[(ch & 15)];
  470.             byte uc = m_pCategoriesValue[valueIndex * 2 + offset];
  471.             //
  472.             // Make sure that OtherNotAssigned is the last category in UnicodeCategory.
  473.             // If that changes, change the following assertion as well.
  474.             //
  475.             //BCLDebug.Assert(uc >= 0 && uc <= UnicodeCategory.OtherNotAssigned, "Table returns incorrect Unicode category");
  476.             return (uc);
  477.         }
  478.        
  479.         // internal static BidiCategory GetBidiCategory(char ch) {
  480.         // return ((BidiCategory)InternalGetCategoryValue(c, BIDI_CATEGORY_OFFSET));
  481.         // }
  482.        
  483.         static internal BidiCategory GetBidiCategory(string s, int index)
  484.         {
  485.             if (s == null)
  486.                 throw new ArgumentNullException("s");
  487.             if (((uint)index) >= ((uint)s.Length)) {
  488.                 throw new ArgumentOutOfRangeException("index");
  489.             }
  490.             return ((BidiCategory)InternalGetCategoryValue(InternalConvertToUtf32(s, index), BIDI_CATEGORY_OFFSET));
  491.         }
  492.        
  493.         ////////////////////////////////////////////////////////////////////////
  494.         //
  495.         //Action: Returns the Unicode Category property for the character c.
  496.         //Returns:
  497.         // an value in UnicodeCategory enum
  498.         //Arguments:
  499.         // value a Unicode String
  500.         // index Index for the specified string.
  501.         //Exceptions:
  502.         // None
  503.         //
  504.         ////////////////////////////////////////////////////////////////////////
  505.        
  506.         static internal UnicodeCategory InternalGetUnicodeCategory(string value, int index)
  507.         {
  508.             BCLDebug.Assert(value != null, "value can not be null");
  509.             BCLDebug.Assert(index < value.Length, "index < value.Length");
  510.            
  511.             return (InternalGetUnicodeCategory(InternalConvertToUtf32(value, index)));
  512.         }
  513.        
  514.         ////////////////////////////////////////////////////////////////////////
  515.         //
  516.         // Get the Unicode category of the character starting at index. If the character is in BMP, charLength will return 1.
  517.         // If the character is a valid surrogate pair, charLength will return 2.
  518.         //
  519.         ////////////////////////////////////////////////////////////////////////
  520.        
  521.         static internal UnicodeCategory InternalGetUnicodeCategory(string str, int index, out int charLength)
  522.         {
  523.             BCLDebug.Assert(str != null, "str can not be null");
  524.             BCLDebug.Assert(str.Length > 0, "str.Length > 0");
  525.             ;
  526.             BCLDebug.Assert(index >= 0 && index < str.Length, "index >= 0 && index < str.Length");
  527.            
  528.             return (InternalGetUnicodeCategory(InternalConvertToUtf32(str, index, out charLength)));
  529.         }
  530.        
  531.         static internal bool IsCombiningCategory(UnicodeCategory uc)
  532.         {
  533.             BCLDebug.Assert(uc >= 0, "uc >= 0");
  534.             return (uc == UnicodeCategory.NonSpacingMark || uc == UnicodeCategory.SpacingCombiningMark || uc == UnicodeCategory.EnclosingMark);
  535.         }
  536.        
  537.         [MethodImplAttribute(MethodImplOptions.InternalCall)]
  538.         unsafe private static extern void nativeInitTable(byte* bytePtr);
  539.        
  540.     }
  541. }

Developer Fusion