The Labs \ Source Viewer \ SSCLI \ System.Text \ EUCJPEncoding

  1. // ==++==
  2. //
  3. //
  4. // Copyright (c) 2006 Microsoft Corporation. All rights reserved.
  5. //
  6. // The use and distribution terms for this software are contained in the file
  7. // named license.txt, which can be found in the root of this distribution.
  8. // By using this software in any fashion, you are agreeing to be bound by the
  9. // terms of this license.
  10. //
  11. // You must not remove this notice, or any other, from this software.
  12. //
  13. //
  14. // ==--==
  15. namespace System.Text
  16. {
  17.     using System.Text;
  18.     using System.Globalization;
  19.    
  20.     // EUCJPEncoding
  21.     //
  22.     // EUC-JP Encoding (51932)
  23.     //
  24.     // EUC-JP has the following code points:
  25.     // 00-7F - ASCII
  26.     // 80-8D & 90-9F - Control. (Like Unicode, except for 8e and 8f)
  27.     // A1-FE, A1-FE - 2 byte JIS X 0208 range.
  28.     // 8E, A1-DF - 2 byte half-width Katakana
  29.     // 8F, A1-FE, A1-FE - 3 byte JIS X 0212 range. WE DON'T USE JIS 0212!!!
  30.     //
  31.     // New thoughts:
  32.     // Fixing windows 20932 code page so that all characters can be looked up there.
  33.     //
  34.     // Old thoughts:
  35.     // Windows NLS uses a special CP20932 for EUC-JP, but it is not used by mlang. Windows
  36.     // Maps the 3 byte ranges to the 2 byte CP20932 by masking the 2nd byte with & 0x7F.
  37.     // MLang uses the native windows 932 code page, which is more reliable; however, its code points
  38.     // don't line up as nicely as those of the 20932 code page, and it doesn't have JIS X 0212 support.
  39.     //
  40.     // So what we do is:
  41.     // 1. For ASCII, leave it alone
  42.     // 2. For half-width Katakana, use the leading byte and convert with 20932 code page.
  43.     // 3. For JIS X 0208, Use the leading & trailing bytes with 20932 code page
  44.     // 4. For JIS X 0212, Remove the lead byte, & 0xFF7F, and use the CP20932 table to convert.
  45.     //
  46.     // Regarding Normalization:
  47.     // Forms KC & KD are precluded because of things like halfwidth Katakana that has compatibility mappings
  48.     // Form D is precluded because of 0x00a8, which changes to space + diaeresis.
  49.     //
  50.     // I think that IsAlwaysNormalized should probably return true for form C (but not certain)
  51.     //
  52.     // NOTE: We don't use JIS 0212 so we are basically a DBCS code page, we just have to modify
  53.     // the 932 table we're basing this on.
  54.     //
  55.    
  56.     using System;
  57.    
  [Serializable()]
  internal class EUCJPEncoding : DBCSCodePageEncoding
  {
      // Code page 51932 (EUC-JP). As far as the in-memory tables are concerned
      // this encoding pretends to be CP 932; the overrides below rewrite the
      // 932 data into EUC-JP form.
      public EUCJPEncoding() : base(51932, 932)
      {
          m_bUseMlangTypeForSerialization = true;
      }

      // Builds the memory section name for this encoding: the code page number
      // in use (the data table code page when bFlagDataTable is set, otherwise
      // CodePage), followed by the four version fields read from pCodePage and
      // an "_EUCJP" suffix.
      protected override unsafe string GetMemorySectionName()
      {
          int sectionCodePage;
          if (bFlagDataTable) {
              sectionCodePage = dataTableCodePage;
          }
          else {
              sectionCodePage = CodePage;
          }

          object[] nameParts = new object[] {
              sectionCodePage,
              pCodePage->VersionMajor,
              pCodePage->VersionMinor,
              pCodePage->VersionRevision,
              pCodePage->VersionBuild
          };

          return String.Format(CultureInfo.InvariantCulture, "CodePage_{0}_{1}_{2}_{3}_{4}_EUCJP", nameParts);
      }

      // Clean up characters for EUC-JP code pages, etc.
      //
      // bytes holds one value from the CP 932 based table and is rewritten in
      // place to its EUC-JP form. Values of 0x100 and above are handled as a
      // lead/trail byte pair, smaller values as a single byte.
      //
      // Returns false when the value is rejected: a converted pair whose lead
      // or trail byte falls outside 0xA1-0xFE, or a single byte from 0x81 up
      // other than 0xA0 / 0xFF. Returns true otherwise.
      // NOTE(review): what the base class does with a false result is not
      // visible here - confirm in DBCSCodePageEncoding.
      protected override bool CleanUpBytes(ref int bytes)
      {
          if (bytes < 0x100) {
              // For 51932, 1/2 Katakana (0xA1-0xDF) gets a 0x8E lead byte.
              if (bytes >= 0xA1 && bytes <= 0xDF) {
                  bytes |= 0x8E00;
                  return true;
              }

              // 0x81-0x9f and 0xe0-0xfc are CP 932 lead bytes, 0xa1-0xfe are our
              // lead bytes. Different lead bytes are set later, so everything
              // from 0x81 up is rejected except 0xa0 and 0xff.
              return bytes < 0x81 || bytes == 0xA0 || bytes == 0xFF;
          }

          // Map extended chars (0xfa40-0xfc4b) to a special range (ported from mlang).
          bytes = RemapExtendedChar(bytes);

          // Convert 932 code page to 20932 like code page range
          // (also ported from mlang).
          byte lead = unchecked((byte)(bytes >> 8));
          byte trail = unchecked((byte)bytes);

          if (lead > 0x9F) {
              lead -= 0xB1;
          }
          else {
              lead -= 0x71;
          }
          lead = (byte)((lead << 1) + 1);

          if (trail > 0x9E) {
              // Upper trail range moves on to the next lead byte.
              trail -= 0x7E;
              lead++;
          }
          else {
              if (trail > 0x7E) {
                  trail--;
              }
              trail -= 0x1F;
          }

          // Recombine the pair and set the high bit on both bytes.
          bytes = (lead << 8) | trail | 0x8080;

          // Only A1-FE, A1-FE is accepted.
          int eucLead = bytes & 0xFF00;
          int eucTrail = bytes & 0x00FF;
          return eucLead >= 0xA100 && eucLead <= 0xFE00
              && eucTrail >= 0xA1 && eucTrail <= 0xFE;
      }

      // Relocates a value in 0xfa40-0xfc4b to the range used before the
      // 932 -> 20932 style conversion (ported from mlang). Any value outside
      // 0xfa40-0xfc4b is returned unchanged.
      private static int RemapExtendedChar(int code)
      {
          if (code < 0xFA40 || code > 0xFC4B) {
              return code;
          }

          if (code >= 0xFA5C) {
              // 0xfa5c-0xfc4b: the offset depends on the low byte.
              int low = code & 0xFF;
              if (low < 0x5C) {
                  return code - 0x0D5F;
              }
              if (low >= 0x80 && low <= 0x9B) {
                  return code - 0x0D1D;
              }
              return code - 0x0D1C;
          }

          // 0xfa40-0xfa5b: three contiguous runs, then four individual values.
          if (code <= 0xFA49) {
              return code - 0x0B51;
          }
          if (code <= 0xFA53) {
              return code - 0x72F6;
          }
          if (code <= 0xFA57) {
              return code - 0x0B5B;
          }

          switch (code) {
              case 0xFA58:
                  return 0x878A;
              case 0xFA59:
                  return 0x8782;
              case 0xFA5A:
                  return 0x8784;
              default:
                  // Only 0xfa5b is left at this point.
                  return 0x879A;
          }
      }

      // Need to special case CP 51932:
      //   0x81-0x9f and 0xe0-0xfc are CP 932 lead bytes,
      //   0x8e and 0xa1-0xfe are CP 20932 lead bytes.
      // Writes LEAD_BYTE_CHAR into the chars entries for 0x8E and 0xA1-0xFE.
      // NOTE(review): chars is assumed to have at least 0xFF entries indexed
      // by byte value - confirm against the caller.
      protected override unsafe void CleanUpEndBytes(char* chars)
      {
          // 8E is the lead byte for Katakana.
          chars[0x8E] = LEAD_BYTE_CHAR;

          // A1-FE are DBCS code points.
          for (char* slot = chars + 0xA1; slot <= chars + 0xFE; slot++) {
              *slot = LEAD_BYTE_CHAR;
          }
      }
  }
  172. }

Developer Fusion