• Skip to content
  • Skip to link menu
KDE 4.3 API Reference
  • KDE API Reference
  • kdelibs
  • Sitemap
  • Contact Us
 

KDECore

kencodingdetector.cpp

Go to the documentation of this file.
00001 /*
00002     This file is part of the KDE libraries
00003 
00004     Copyright (C) 1999 Lars Knoll (knoll@kde.org)
00005     Copyright (C) 2003 Dirk Mueller (mueller@kde.org)
00006     Copyright (C) 2003 Apple Computer, Inc.
00007     Copyright (C) 2007 Nick Shaforostoff (shafff@ukr.net)
00008 
00009     This library is free software; you can redistribute it and/or
00010     modify it under the terms of the GNU Library General Public
00011     License as published by the Free Software Foundation; either
00012     version 2 of the License, or (at your option) any later version.
00013 
00014     This library is distributed in the hope that it will be useful,
00015     but WITHOUT ANY WARRANTY; without even the implied warranty of
00016     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00017     Library General Public License for more details.
00018 
00019     You should have received a copy of the GNU Library General Public License
00020     along with this library; see the file COPYING.LIB.  If not, write to
00021     the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
00022     Boston, MA 02110-1301, USA.
00023 */
00024 //----------------------------------------------------------------------------
00025 //
00026 // decoder for input stream
00027 
00028 #include "kencodingdetector.h"
00029 
00030 #undef DECODE_DEBUG
00031 //#define DECODE_DEBUG
00032 
00033 #define MAX_BUFFER 16*1024
00034 
00035 #include <assert.h>
00036 
00037 #include "guess_ja_p.h"
00038 
00039 #include <QRegExp>
00040 #include <QTextCodec>
00041 
00042 #include <kglobal.h>
00043 #include <kcharsets.h>
00044 #include <kdebug.h>
00045 #include <klocale.h>
00046 
00047 #include <ctype.h>
00048 
// MIBenum values (the numbers QTextCodec::mibEnum() reports) for the
// character sets this file has to treat specially. Listed in numeric order.
enum MIB
{
    MibLatin1  = 4,     // default codec used by KEncodingDetectorPrivate
    Mib8859_8  = 85,    // Hebrew, special-cased in setEncoding()
    MibUtf8    = 106,   // checked by errorsIfUtf8()
    MibUcs2    = 1000,  // 16-bit, see is16Bit()
    MibUtf16BE = 1013,  // 16-bit, see is16Bit()
    MibUtf16LE = 1014,  // 16-bit, see is16Bit()
    MibUtf16   = 1015   // 16-bit, see is16Bit()
};
00059 
00060 static bool is16Bit(QTextCodec* codec)
00061 {
00062     switch (codec->mibEnum())
00063     {
00064     case MibUtf16:
00065     case MibUtf16BE:
00066     case MibUtf16LE:
00067     case MibUcs2:
00068         return true;
00069     default:
00070         return false;
00071     }
00072 }
00073 
00074 class KEncodingDetectorPrivate
00075 {
00076 public:
00077     QTextCodec *m_codec;
00078     QTextDecoder *m_decoder; // utf16
00079     QTextCodec *m_defaultCodec;
00080     QByteArray  m_storeDecoderName;
00081 
00082     KEncodingDetector::EncodingChoiceSource m_source;
00083     KEncodingDetector::AutoDetectScript m_autoDetectLanguage;
00084 
00085     bool m_visualRTL : 1;
00086     bool m_seenBody : 1;
00087     bool m_writtingHappened : 1;
00088     bool m_analyzeCalled : 1; //for decode()
00089     int m_multiByte;
00090 
00091     QByteArray m_bufferForDefferedEncDetection;
00092 
00093     KEncodingDetectorPrivate()
00094             : m_codec(QTextCodec::codecForMib(MibLatin1))
00095             , m_decoder(m_codec->makeDecoder())
00096             , m_defaultCodec(m_codec)
00097             , m_source(KEncodingDetector::DefaultEncoding)
00098             , m_autoDetectLanguage(KEncodingDetector::SemiautomaticDetection)
00099             , m_visualRTL(false)
00100             , m_seenBody(false)
00101             , m_writtingHappened(false)
00102             , m_analyzeCalled(false)
00103             , m_multiByte(0)
00104     {
00105     }
00106 
00107     KEncodingDetectorPrivate(QTextCodec* codec,KEncodingDetector::EncodingChoiceSource source, KEncodingDetector::AutoDetectScript script)
00108             : m_codec(codec)
00109             , m_decoder(m_codec->makeDecoder())
00110             , m_defaultCodec(m_codec)
00111             , m_source(source)
00112             , m_autoDetectLanguage(script)
00113             , m_visualRTL(false)
00114             , m_seenBody(false)
00115             , m_writtingHappened(false)
00116             , m_analyzeCalled(false)
00117             , m_multiByte(0)
00118     {
00119     }
00120 
00121     ~KEncodingDetectorPrivate()
00122     {
00123         delete m_decoder;
00124     }
00125 
00126     // Returns true if the encoding was explicitly specified someplace.
00127     bool isExplicitlySpecifiedEncoding()
00128     {
00129         return m_source != KEncodingDetector::DefaultEncoding && m_source != KEncodingDetector::AutoDetectedEncoding;
00130     }
00131 };
00132 
00133 
00134 static QByteArray automaticDetectionForArabic( const unsigned char* ptr, int size )
00135 {
00136     for ( int i = 0; i < size; ++i ) {
00137         if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) || ptr[ i ] == 0xA1 || ptr[ i ] == 0xA2 || ptr[ i ] == 0xA3
00138              || ( ptr[ i ] >= 0xA5 && ptr[ i ] <= 0xAB ) || ( ptr[ i ] >= 0xAE && ptr[ i ] <= 0xBA )
00139              || ptr[ i ] == 0xBC || ptr[ i ] == 0xBD || ptr[ i ] == 0xBE || ptr[ i ] == 0xC0
00140              || ( ptr[ i ] >= 0xDB && ptr[ i ] <= 0xDF ) || ( ptr[ i ] >= 0xF3 ) ) {
00141             return "cp1256";
00142         }
00143     }
00144 
00145     return "iso-8859-6";
00146 }
00147 
00148 static QByteArray automaticDetectionForBaltic( const unsigned char* ptr, int size )
00149 {
00150     for ( int i = 0; i < size; ++i ) {
00151         if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9E ) )
00152              return "cp1257";
00153 
00154         if ( ptr[ i ] == 0xA1 || ptr[ i ] == 0xA5 )
00155             return "iso-8859-13";
00156     }
00157 
00158     return "iso-8859-13";
00159 }
00160 
00161 static QByteArray automaticDetectionForCentralEuropean(const unsigned char* ptr, int size )
00162 {
00163     QByteArray charset = QByteArray();
00164     for ( int i = 0; i < size; ++i ) {
00165         if ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) {
00166             if ( ptr[ i ] == 0x81 || ptr[ i ] == 0x83 || ptr[ i ] == 0x90 || ptr[ i ] == 0x98 )
00167                 return "ibm852";
00168 
00169             if ( i + 1 > size )
00170                 return "cp1250";
00171             else { // maybe ibm852 ?
00172                 charset = "cp1250";
00173                 continue;
00174             }
00175         }
00176         if ( ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE || ptr[ i ] == 0xBE || ptr[ i ] == 0xC3 || ptr[ i ] == 0xD0 || ptr[ i ] == 0xE3 || ptr[ i ] == 0xF0 ) {
00177             if ( i + 1 > size )
00178                 return "iso-8859-2";
00179             else {  // maybe ibm852 ?
00180                 if ( charset.isNull() )
00181                     charset = "iso-8859-2";
00182                 continue;
00183             }
00184         }
00185     }
00186 
00187     if ( charset.isNull() )
00188         charset = "iso-8859-3";
00189 
00190     return charset.data();
00191 }
00192 
00193 static QByteArray automaticDetectionForCyrillic( const unsigned char* ptr, int size)
00194 {
00195 #ifdef DECODE_DEBUG
00196         kWarning() << "KEncodingDetector: Cyr heuristics";
00197 #endif
00198 
00199 //     if (ptr[0]==0xef && ptr[1]==0xbb && ptr[2]==0xbf)
00200 //         return "utf8";
00201     int utf8_mark=0;
00202     int koi_score=0;
00203     int cp1251_score=0;
00204 
00205     int koi_st=0;
00206     int cp1251_st=0;
00207 
00208 //     int koi_na=0;
00209 //     int cp1251_na=0;
00210 
00211     int koi_o_capital=0;
00212     int koi_o=0;
00213     int cp1251_o_capital=0;
00214     int cp1251_o=0;
00215 
00216     int koi_a_capital=0;
00217     int koi_a=0;
00218     int cp1251_a_capital=0;
00219     int cp1251_a=0;
00220 
00221     int koi_s_capital=0;
00222     int koi_s=0;
00223     int cp1251_s_capital=0;
00224     int cp1251_s=0;
00225 
00226     int koi_i_capital=0;
00227     int koi_i=0;
00228     int cp1251_i_capital=0;
00229     int cp1251_i=0;
00230 
00231     int cp1251_small_range=0;
00232     int koi_small_range=0;
00233     int ibm866_small_range=0;
00234 
00235     int i;
00236     for (i=1; (i<size) && (cp1251_small_range+koi_small_range<1000) ;++i)
00237     {
00238         if (ptr[i]>0xdf)
00239         {
00240             ++cp1251_small_range;
00241 
00242             if (ptr[i]==0xee)//small o
00243                 ++cp1251_o;
00244             else if (ptr[i]==0xe0)//small a
00245                 ++cp1251_a;
00246             else if (ptr[i]==0xe8)//small i
00247                 ++cp1251_i;
00248             else if (ptr[i]==0xf1)//small s
00249                 ++cp1251_s;
00250             else if (ptr[i]==0xf2 && ptr[i-1]==0xf1)//small st
00251                 ++cp1251_st;
00252 
00253             else if (ptr[i]==0xef)
00254                 ++koi_o_capital;
00255             else if (ptr[i]==0xe1)
00256                 ++koi_a_capital;
00257             else if (ptr[i]==0xe9)
00258                 ++koi_i_capital;
00259             else if (ptr[i]==0xf3)
00260                 ++koi_s_capital;
00261 
00262         }
00263         else if (ptr[i]>0xbf)
00264         {
00265             ++koi_small_range;
00266 
00267             if (ptr[i]==0xd0||ptr[i]==0xd1)//small o
00268                 ++utf8_mark;
00269             else if (ptr[i]==0xcf)//small o
00270                 ++koi_o;
00271             else if (ptr[i]==0xc1)//small a
00272                 ++koi_a;
00273             else if (ptr[i]==0xc9)//small i
00274                 ++koi_i;
00275             else if (ptr[i]==0xd3)//small s
00276                 ++koi_s;
00277             else if (ptr[i]==0xd4 && ptr[i-1]==0xd3)//small st
00278                 ++koi_st;
00279 
00280             else if (ptr[i]==0xce)
00281                 ++cp1251_o_capital;
00282             else if (ptr[i]==0xc0)
00283                 ++cp1251_a_capital;
00284             else if (ptr[i]==0xc8)
00285                 ++cp1251_i_capital;
00286             else if (ptr[i]==0xd1)
00287                 ++cp1251_s_capital;
00288         }
00289         else if (ptr[i]>0x9f && ptr[i]<0xb0) //first 16 letterz is 60%
00290             ++ibm866_small_range;
00291 
00292     }
00293 
00294     //cannot decide?
00295     if (cp1251_small_range+koi_small_range+ibm866_small_range<8)
00296     {
00297         return "";
00298     }
00299 
00300     if (3*utf8_mark>cp1251_small_range+koi_small_range+ibm866_small_range)
00301     {
00302 #ifdef DECODE_DEBUG
00303         kWarning() << "Cyr Enc Detection: UTF8";
00304 #endif
00305         return "UTF-8";
00306     }
00307 
00308     if (ibm866_small_range>cp1251_small_range+koi_small_range)
00309         return "ibm866";
00310 
00311 //     QByteArray koi_string = "koi8-u";
00312 //     QByteArray cp1251_string = "cp1251";
00313 
00314     if (cp1251_st==0 && koi_st>1)
00315         koi_score+=10;
00316     else if (koi_st==0 && cp1251_st>1)
00317         cp1251_score+=10;
00318 
00319     if (cp1251_st && koi_st)
00320     {
00321         if (cp1251_st/koi_st>2)
00322             cp1251_score+=20;
00323         else if (koi_st/cp1251_st>2)
00324             koi_score+=20;
00325     }
00326 
00327     if (cp1251_a>koi_a)
00328         cp1251_score+=10;
00329     else if (cp1251_a || koi_a)
00330         koi_score+=10;
00331 
00332     if (cp1251_o>koi_o)
00333         cp1251_score+=10;
00334     else if (cp1251_o || koi_o)
00335         koi_score+=10;
00336 
00337     if (cp1251_i>koi_i)
00338         cp1251_score+=10;
00339     else if (cp1251_i || koi_i)
00340         koi_score+=10;
00341 
00342     if (cp1251_s>koi_s)
00343         cp1251_score+=10;
00344     else if (cp1251_s || koi_s)
00345         koi_score+=10;
00346 
00347     if (cp1251_a_capital>koi_a_capital)
00348         cp1251_score+=9;
00349     else if (cp1251_a_capital || koi_a_capital)
00350         koi_score+=9;
00351 
00352     if (cp1251_o_capital>koi_o_capital)
00353         cp1251_score+=9;
00354     else if (cp1251_o_capital || koi_o_capital)
00355         koi_score+=9;
00356 
00357     if (cp1251_i_capital>koi_i_capital)
00358         cp1251_score+=9;
00359     else if (cp1251_i_capital || koi_i_capital)
00360         koi_score+=9;
00361 
00362     if (cp1251_s_capital>koi_s_capital)
00363         cp1251_score+=9;
00364     else if (cp1251_s_capital || koi_s_capital)
00365         koi_score+=9;
00366 #ifdef DECODE_DEBUG
00367     kWarning()<<"koi_score " << koi_score << " cp1251_score " << cp1251_score;
00368 #endif
00369     if (abs(koi_score-cp1251_score)<10)
00370     {
00371         //fallback...
00372         cp1251_score=cp1251_small_range;
00373         koi_score=koi_small_range;
00374     }
00375     if (cp1251_score>koi_score)
00376         return "cp1251";
00377     else
00378         return "koi8-u";
00379 
00380 
00381 //     if (cp1251_score>koi_score)
00382 //         setEncoding("cp1251",AutoDetectedEncoding);
00383 //     else
00384 //         setEncoding("koi8-u",AutoDetectedEncoding);
00385 //     return true;
00386 
00387 }
00388 
00389 static QByteArray automaticDetectionForGreek( const unsigned char* ptr, int size )
00390 {
00391     for ( int i = 0; i < size; ++i ) {
00392         if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x87 ) || ptr[ i ] == 0x89 || ptr[ i ] == 0x8B
00393              || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x97 ) || ptr[ i ] == 0x99 || ptr[ i ] == 0x9B || ptr[ i ] == 0xA4
00394              || ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE ) {
00395             return "cp1253";
00396         }
00397     }
00398 
00399     return "iso-8859-7";
00400 }
00401 
00402 static QByteArray automaticDetectionForHebrew( const unsigned char* ptr, int size )
00403 {
00404     for ( int i = 0; i < size; ++i ) {
00405         if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x89 ) || ptr[ i ] == 0x8B
00406              || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x99 ) || ptr[ i ] == 0x9B || ptr[ i ] == 0xA1 || ( ptr[ i ] >= 0xBF && ptr[ i ] <= 0xC9 )
00407              || ( ptr[ i ] >= 0xCB && ptr[ i ] <= 0xD8 ) ) {
00408             return "cp1255";
00409         }
00410 
00411         if ( ptr[ i ] == 0xDF )
00412             return "iso-8859-8-i";
00413     }
00414 
00415     return "iso-8859-8-i";
00416 }
00417 
00418 static QByteArray automaticDetectionForJapanese( const unsigned char* ptr, int size )
00419 {
00420     JapaneseCode kc;
00421 
00422     switch ( kc.guess_jp( (const char*)ptr, size ) ) {
00423     case JapaneseCode::JIS:
00424         return "jis7";
00425     case JapaneseCode::EUC:
00426         return "eucjp";
00427     case JapaneseCode::SJIS:
00428         return "sjis";
00429      case JapaneseCode::UTF8:
00430         return "utf8";
00431     default:
00432         break;
00433     }
00434 
00435     return "";
00436 }
00437 
00438 static QByteArray automaticDetectionForTurkish( const unsigned char* ptr, int size )
00439 {
00440     for ( int i = 0; i < size; ++i ) {
00441         if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x8C ) || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x9C ) || ptr[ i ] == 0x9F ) {
00442             return "cp1254";
00443         }
00444     }
00445 
00446     return "iso-8859-9";
00447 }
00448 
00449 static QByteArray automaticDetectionForWesternEuropean( const unsigned char* ptr, int size )
00450 {
00451     --size;
00452     uint nonansi_count=0;
00453     for (int i=0; i<size; ++i)
00454     {
00455         if (ptr[i]>0x79)
00456         {
00457              ++nonansi_count;
00458             if ( ptr[i]>0xc1 && ptr[i]<0xf0 && ptr[i+1]>0x7f && ptr[i+1]<0xc0)
00459             {
00460                 return "UTF-8";
00461             }
00462             if (ptr[i] >= 0x78 && ptr[i]<=0x9F )
00463             {
00464                 return "cp1252";
00465             }
00466         }
00467 
00468     }
00469 
00470     if (nonansi_count>0)
00471         return "iso-8859-15";
00472 
00473     return "";
00474 }
00475 
// Other browsers allow comments in the head section, so we need to also.
// It's important not to look for tags inside the comments.
//
// On entry @p ptr points just behind the opening "<!--". On return it points
// just behind the comment terminator, or equals @p pEnd when the comment is
// not closed inside [ptr, pEnd).
//
// Recognised terminators:
//   ">"     directly at @p ptr, i.e. the degenerate comment "<!-->"
//   "-->"   the regular end of a comment
//   "--!>"  an incorrect end of comment that other browsers accept
//
// No byte at or behind @p pEnd is read, so the range does not need to be
// NUL-terminated, and @p ptr is never moved past @p pEnd.
static void skipComment(const char *&ptr, const char *pEnd)
{
    const char *p = ptr;

    // Allow <!-->; other browsers do.
    if (p!=pEnd && *p=='>')
    {
        ++p;
    }
    else
    {
        while (p!=pEnd)
        {
            if (*p=='-')
            {
                // This is the real end of comment, "-->".
                if (pEnd-p>=3 && p[1]=='-' && p[2]=='>')
                {
                    p += 3;
                    break;
                }
                // This is the incorrect end of comment that other browsers allow, "--!>".
                if (pEnd-p>=4 && p[1]=='-' && p[2]=='!' && p[3]=='>')
                {
                    p += 4;
                    break;
                }
            }
            ++p;
        }
    }

    ptr = p;
}
00510 
00511 // Returns the position of the encoding string.
00512 static int findXMLEncoding(const QByteArray &str, int &encodingLength)
00513 {
00514     int len = str.length();
00515     int pos = str.indexOf("encoding");
00516     if (pos == -1)
00517         return -1;
00518     pos += 8;
00519 
00520     // Skip spaces and stray control characters.
00521     while (pos<len && str[pos]<=' ')
00522         ++pos;
00523 
00524     //Bail out if nothing after
00525     // Skip equals sign.
00526     if (pos>=len || str[pos] != '=')
00527         return -1;
00528     ++pos;
00529 
00530     // Skip spaces and stray control characters.
00531     while (pos<len && str[pos]<=' ')
00532         ++pos;
00533 
00534     //Bail out if nothing after
00535     if (pos >= len)
00536         return -1;
00537 
00538     // Skip quotation mark.
00539     char quoteMark = str[pos];
00540     if (quoteMark != '"' && quoteMark != '\'')
00541         return -1;
00542     ++pos;
00543 
00544     // Find the trailing quotation mark.
00545     int end=pos;
00546     while (end<len && str[end]!=quoteMark)
00547         ++end;
00548 
00549     if (end>=len)
00550         return -1;
00551 
00552     encodingLength = end-pos;
00553     return pos;
00554 }
00555 
00556 bool KEncodingDetector::processNull(char *data, int len)
00557 {
00558     bool bin=false;
00559     if(is16Bit(d->m_codec))
00560     {
00561         for (int i=1; i < len; i+=2)
00562         {
00563             if ((data[i]=='\0') && (data[i-1]=='\0'))
00564             {
00565                 bin=true;
00566                 data[i]=' ';
00567             }
00568         }
00569         return bin;
00570     }
00571     // replace '\0' by spaces, for buggy pages
00572     int i = len-1;
00573     while(--i>=0)
00574     {
00575         if(data[i]==0)
00576         {
00577             bin=true;
00578             data[i]=' ';
00579         }
00580     }
00581     return bin;
00582 }
00583 
00584 
00585 bool KEncodingDetector::errorsIfUtf8 (const char* data, int length)
00586 {
00587     if (d->m_codec->mibEnum()!=MibUtf8)
00588         return false; //means no errors
00589 // #define highest1Bits (unsigned char)0x80
00590 // #define highest2Bits (unsigned char)0xC0
00591 // #define highest3Bits (unsigned char)0xE0
00592 // #define highest4Bits (unsigned char)0xF0
00593 // #define highest5Bits (unsigned char)0xF8
00594 static const unsigned char highest1Bits = 0x80;
00595 static const unsigned char highest2Bits = 0xC0;
00596 static const unsigned char highest3Bits = 0xE0;
00597 static const unsigned char highest4Bits = 0xF0;
00598 static const unsigned char highest5Bits = 0xF8;
00599 
00600     for (int i=0; i<length; ++i)
00601     {
00602         unsigned char c = data[i];
00603 
00604         if (d->m_multiByte>0)
00605         {
00606             if ((c & highest2Bits) == 0x80)
00607             {
00608                 --(d->m_multiByte);
00609                 continue;
00610             }
00611 #ifdef DECODE_DEBUG
00612             kWarning() << "EncDetector: Broken UTF8";
00613 #endif
00614             return true;
00615         }
00616 
00617         // most significant bit zero, single char
00618         if ((c & highest1Bits) == 0x00)
00619             continue;
00620 
00621         // 110xxxxx => init 1 following bytes
00622         if ((c & highest3Bits) == 0xC0)
00623         {
00624             d->m_multiByte = 1;
00625             continue;
00626         }
00627 
00628         // 1110xxxx => init 2 following bytes
00629         if ((c & highest4Bits) == 0xE0)
00630         {
00631             d->m_multiByte = 2;
00632             continue;
00633         }
00634 
00635         // 11110xxx => init 3 following bytes
00636         if ((c & highest5Bits) == 0xF0)
00637         {
00638             d->m_multiByte = 3;
00639             continue;
00640         }
00641 #ifdef DECODE_DEBUG
00642         kWarning() << "EncDetector:_Broken UTF8";
00643 #endif
00644         return true;
00645     }
00646     return false;
00647 }
00648 
00649 
00650 KEncodingDetector::KEncodingDetector() : d(new KEncodingDetectorPrivate)
00651 {
00652 }
00653 
00654 KEncodingDetector::KEncodingDetector(QTextCodec* codec, EncodingChoiceSource source, AutoDetectScript script) :
00655     d(new KEncodingDetectorPrivate(codec,source,script))
00656 {
00657 }
00658 
00659 KEncodingDetector::~KEncodingDetector()
00660 {
00661     delete d;
00662 }
00663 
00664 void KEncodingDetector::setAutoDetectLanguage( KEncodingDetector::AutoDetectScript lang)
00665 {
00666     d->m_autoDetectLanguage=lang;
00667 }
00668 KEncodingDetector::AutoDetectScript KEncodingDetector::autoDetectLanguage() const
00669 {
00670     return d->m_autoDetectLanguage;
00671 }
00672 
00673 KEncodingDetector::EncodingChoiceSource KEncodingDetector::encodingChoiceSource() const
00674 {
00675     return d->m_source;
00676 }
00677 
00678 const char* KEncodingDetector::encoding() const
00679 {
00680     d->m_storeDecoderName = d->m_codec->name();
00681     return d->m_storeDecoderName.constData();
00682 }
00683 
00684 bool KEncodingDetector::visuallyOrdered() const
00685 {
00686     return d->m_visualRTL;
00687 }
00688 
00689 // const QTextCodec* KEncodingDetector::codec() const
00690 // {
00691 //     return d->m_codec;
00692 // }
00693 
00694 QTextDecoder* KEncodingDetector::decoder()
00695 {
00696     return d->m_decoder;
00697 }
00698 
00699 void KEncodingDetector::resetDecoder()
00700 {
00701     assert(d->m_defaultCodec);
00702     d->m_bufferForDefferedEncDetection.clear();
00703     d->m_writtingHappened = false;
00704     d->m_analyzeCalled = false;
00705     d->m_multiByte = 0;
00706     delete d->m_decoder;
00707     if (!d->m_codec)
00708         d->m_codec = d->m_defaultCodec;
00709     d->m_decoder = d->m_codec->makeDecoder();
00710 }
00711 
00712 bool KEncodingDetector::setEncoding(const char *_encoding, EncodingChoiceSource type)
00713 {
00714     QTextCodec *codec;
00715     QByteArray enc(_encoding);
00716     if(/*enc.isNull() || */enc.isEmpty())
00717     {
00718         if (type==DefaultEncoding)
00719             codec=d->m_defaultCodec;
00720         else
00721             return false;
00722     }
00723     else
00724     {
00725         //QString->QTextCodec
00726 
00727         enc = enc.toLower();
00728          // hebrew visually ordered
00729         if(enc=="visual")
00730             enc="iso8859-8";
00731         bool b;
00732         codec = KGlobal::charsets()->codecForName(enc, b);
00733         if (!b)
00734         return false;
00735     }
00736 
00737     if (d->m_codec->mibEnum()==codec->mibEnum())
00738     {
00739         // We already have the codec, but we still want to re-set the type,
00740         // as we may have overwritten a default with a detected
00741         d->m_source = type;
00742         return true;
00743     }
00744 
00745     if ((type==EncodingFromMetaTag || type==EncodingFromXMLHeader) && is16Bit(codec))
00746     {
00747         //Sometimes the codec specified is absurd, i.e. UTF-16 despite
00748         //us decoding a meta tag as ASCII. In that case, ignore it.
00749         return false;
00750     }
00751 
00752     if (codec->mibEnum() == Mib8859_8)
00753     {
00754         //We do NOT want to use Qt's QHebrewCodec, since it tries to reorder itself.
00755         codec = QTextCodec::codecForName("iso8859-8-i");
00756 
00757         // visually ordered unless one of the following
00758         if(!(enc=="iso-8859-8-i"||enc=="iso_8859-8-i"||enc=="csiso88598i"||enc=="logical"))
00759             d->m_visualRTL = true;
00760     }
00761 
00762     d->m_codec = codec;
00763     d->m_source = type;
00764     delete d->m_decoder;
00765     d->m_decoder = d->m_codec->makeDecoder();
00766 #ifdef DECODE_DEBUG
00767     kDebug(6005) << "KEncodingDetector::encoding used is" << d->m_codec->name();
00768 #endif
00769     return true;
00770 }
00771 
00772 QString KEncodingDetector::decode(const char *data, int len)
00773 {
00774     processNull(const_cast<char *>(data),len);
00775     if (!d->m_analyzeCalled)
00776     {
00777         analyze(data,len);
00778         d->m_analyzeCalled=true;
00779     }
00780 
00781     return d->m_decoder->toUnicode(data,len);
00782 }
00783 
00784 QString KEncodingDetector::decode(const QByteArray &data)
00785 {
00786     processNull(const_cast<char *>(data.data()),data.size());
00787     if (!d->m_analyzeCalled)
00788     {
00789         analyze(data.data(),data.size());
00790         d->m_analyzeCalled=true;
00791     }
00792 
00793     return d->m_decoder->toUnicode(data);
00794 }
00795 
00796 QString KEncodingDetector::decodeWithBuffering(const char *data, int len)
00797 {
00798 #ifdef DECODE_DEBUG
00799         kWarning() << "KEncodingDetector: decoding "<<len<<" bytes";
00800 #endif
00801     if (d->m_writtingHappened)
00802     {
00803 #ifdef DECODE_DEBUG
00804         kWarning() << "KEncodingDetector: d->m_writtingHappened "<< d->m_codec->name();
00805 #endif
00806         processNull(const_cast<char *>(data),len);
00807         return d->m_decoder->toUnicode(data, len);
00808     }
00809     else
00810     {
00811         if (d->m_bufferForDefferedEncDetection.isEmpty())
00812         {
00813             // If encoding detection produced something, and we either got to the body or
00814             // actually saw the encoding explicitly, we're done.
00815             if (analyze(data,len) && (d->m_seenBody || d->isExplicitlySpecifiedEncoding()))
00816             {
00817 #ifdef DECODE_DEBUG
00818                 kWarning() << "KEncodingDetector: m_writtingHappened first time "<< d->m_codec->name();
00819 #endif
00820                 processNull(const_cast<char *>(data),len);
00821                 d->m_writtingHappened=true;
00822                 return d->m_decoder->toUnicode(data, len);
00823             }
00824             else
00825             {
00826 #ifdef DECODE_DEBUG
00827                 kWarning() << "KEncodingDetector: begin deffer";
00828 #endif
00829                 d->m_bufferForDefferedEncDetection=data;
00830             }
00831         }
00832         else
00833         {
00834             d->m_bufferForDefferedEncDetection+=data;
00835             // As above, but also limit the buffer size. We must use the entire buffer here,
00836             // since the boundaries might split the meta tag, etc.
00837             bool detected = analyze(d->m_bufferForDefferedEncDetection.constData(), d->m_bufferForDefferedEncDetection.length());
00838             if ((detected && (d->m_seenBody || d->isExplicitlySpecifiedEncoding())) ||
00839                  d->m_bufferForDefferedEncDetection.length() > MAX_BUFFER)
00840             {
00841                 d->m_writtingHappened=true;
00842                 d->m_bufferForDefferedEncDetection.replace('\0',' ');
00843                 QString result(d->m_decoder->toUnicode(d->m_bufferForDefferedEncDetection));
00844                 d->m_bufferForDefferedEncDetection.clear();
00845 #ifdef DECODE_DEBUG
00846                 kWarning() << "KEncodingDetector: m_writtingHappened in the middle " << d->m_codec->name();
00847 #endif
00848                 return result;
00849             }
00850         }
00851     }
00852 
00853     return QString();
00854 }
00855 
00856 bool KEncodingDetector::decodedInvalidCharacters() const
00857 {
00858     return d->m_decoder ? d->m_decoder->hasFailure() : false;
00859 }
00860 
00861 QString KEncodingDetector::flush()
00862 {
00863     if (d->m_bufferForDefferedEncDetection.isEmpty())
00864         return QString();
00865 
00866     d->m_bufferForDefferedEncDetection.replace('\0',' ');
00867     QString result(d->m_decoder->toUnicode(d->m_bufferForDefferedEncDetection));
00868     d->m_bufferForDefferedEncDetection.clear();
00869 #ifdef DECODE_DEBUG
00870     kWarning() << "KEncodingDetector:flush() "<< d->m_bufferForDefferedEncDetection.length()<<" bytes "<< d->m_codec->name();
00871 #endif
00872     return result;
00873 }
00874 
00875 bool KEncodingDetector::analyze(const char *data, int len)
00876 {
00877     // Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
00878     // maximumBOMLength = 10
00879     // Even if the user has chosen utf16 we still need to auto-detect the endianness
00880     if (len >= 10 && ((d->m_source != UserChosenEncoding) || is16Bit(d->m_codec)))
00881     {
00882         // Extract the first three bytes.
00883         const uchar *udata = (const uchar *)data;
00884         uchar c1 = *udata++;
00885         uchar c2 = *udata++;
00886         uchar c3 = *udata++;
00887 
00888         // Check for the BOM
00889         const char *autoDetectedEncoding;
00890         if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE))
00891         {
00892             autoDetectedEncoding = "UTF-16";
00893         }
00894         else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF)
00895         {
00896             autoDetectedEncoding = "UTF-8";
00897         }
00898         else if (c1 == 0x00 || c2 == 0x00)
00899         {
00900             uchar c4 = *udata++;
00901             uchar c5 = *udata++;
00902             uchar c6 = *udata++;
00903             uchar c7 = *udata++;
00904             uchar c8 = *udata++;
00905             uchar c9 = *udata++;
00906             uchar c10 = *udata++;
00907 
00908             int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0);
00909             int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0);
00910             if ((nul_count_even==0 && nul_count_odd==5) || (nul_count_even==5 && nul_count_odd==0))
00911                 autoDetectedEncoding = "UTF-16";
00912             else
00913                 autoDetectedEncoding = 0;
00914         }
00915         else
00916         {
00917             autoDetectedEncoding = 0;
00918         }
00919 
00920         // If we found a BOM, use the encoding it implies.
00921         if (autoDetectedEncoding != 0)
00922         {
00923             d->m_source = BOM;
00924             d->m_codec = QTextCodec::codecForName(autoDetectedEncoding);
00925             assert(d->m_codec);
00926             //enc = d->m_codec->name();
00927             delete d->m_decoder;
00928             d->m_decoder = d->m_codec->makeDecoder();
00929 #ifdef DECODE_DEBUG
00930             kWarning() << "Detection by BOM";
00931 #endif
00932             if (is16Bit(d->m_codec) && c2==0x00)
00933             {
00934                 // utf16LE, we need to put the decoder in LE mode
00935                 char reverseUtf16[3] = {(char)0xFF, (char)0xFE, 0x00};
00936                 d->m_decoder->toUnicode(reverseUtf16, 2);
00937             }
00938             return true;
00939         }
00940     }
00941 
00942     //exit from routine in case it was called to only detect byte order for utf-16
00943     if (d->m_source==UserChosenEncoding)
00944     {
00945 #ifdef DECODE_DEBUG
00946         kWarning() << "KEncodingDetector: UserChosenEncoding exit ";
00947 #endif
00948 
00949         if (errorsIfUtf8(data, len))
00950             setEncoding("",DefaultEncoding);
00951         return true;
00952     }
00953 
00954     // HTTP header takes precedence over meta-type stuff
00955     if (d->m_source==EncodingFromHTTPHeader)
00956         return true;
00957 
00958     if (!d->m_seenBody)
00959     {
00960         // we still don't have an encoding, and are in the head
00961         // the following tags are allowed in <head>:
00962         // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE
00963         const char *ptr = data;
00964         const char *pEnd = data+len;
00965 
00966         while(ptr != pEnd)
00967         {
00968             if(*ptr!='<')
00969             {
00970                 ++ptr;
00971                 continue;
00972             }
00973             ++ptr;
00974             // Handle comments.
00975             if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-')
00976             {
00977                 ptr += 3;
00978                 skipComment(ptr, pEnd);
00979                 continue;
00980             }
00981 
00982             // Handle XML header, which can have encoding in it.
00983             if (ptr[0]=='?' && ptr[1]=='x' && ptr[2]=='m' && ptr[3]=='l')
00984             {
00985                 const char *end = ptr;
00986                 while (*end != '>' && end < pEnd)
00987                     end++;
00988                 if (*end == '\0' || end == pEnd)
00989                     break;
00990                 QByteArray str(ptr, end - ptr); // qbytearray provides the \0 terminator
00991                 int length;
00992                 int pos = findXMLEncoding(str, length);
00993                 // also handles the case when specified encoding aint correct
00994                 if (pos!=-1 && setEncoding(str.mid(pos, length), EncodingFromXMLHeader))
00995                 {
00996                     return true;
00997                 }
00998             }
00999 
01000             //look for <meta>, stop if we reach <body>
01001             while (
01002                         !(((*ptr >= 'a') && (*ptr <= 'z')) ||
01003                         ((*ptr >= 'A') && (*ptr <= 'Z')))
01004                         && ptr < pEnd
01005                 )
01006                 ++ptr;
01007 
01008             char tmp[5];
01009             int length=0;
01010             const char* max=ptr+4;
01011             if (pEnd<max)
01012                 max=pEnd;
01013             while (
01014                         (((*ptr >= 'a') && (*ptr <= 'z')) ||
01015                         ((*ptr >= 'A') && (*ptr <= 'Z')) ||
01016                         ((*ptr >= '0') && (*ptr <= '9')))
01017                         && ptr < max
01018                 )
01019             {
01020                 tmp[length] = tolower( *ptr );
01021                 ++ptr;
01022                 ++length;
01023             }
01024             tmp[length] = 0;
01025             if (tmp[0]=='m'&&tmp[1]=='e'&&tmp[2]=='t'&&tmp[3]=='a')
01026             {
01027                 // found a meta tag...
01028                 const char* end = ptr;
01029                 while(*end != '>' && *end != '\0' && end<pEnd)
01030                     end++;
01031                 //if ( *end == '\0' ) break;
01032                 QByteArray str( ptr, (end-ptr)+1);
01033                 str = str.toLower();
01034                 int pos=0;
01035                         //if( (pos = str.find("http-equiv", pos)) == -1) break;
01036                         //if( (pos = str.find("content-type", pos)) == -1) break;
01037                 if( (pos = str.indexOf("charset")) == -1)
01038                     continue;
01039                 pos+=6;
01040                 // skip to '='
01041                 if( (pos = str.indexOf("=", pos)) == -1)
01042                     continue;
01043 
01044                 // skip '='
01045                 ++pos;
01046 
01047                 // skip whitespace before encoding itself
01048                 while (pos < (int)str.length() && str[pos] <= ' ')
01049                     ++pos;
01050                 
01051                 // there may also be an opening quote, if this is a charset= and not 
01052                 // a http-equiv.
01053                 if (pos < (int)str.length() && str[pos] == '"')
01054                     ++pos;
01055 
01056                 if ( pos == (int)str.length())
01057                     continue;
01058 
01059                 int endpos = pos;
01060                 while( endpos < str.length() &&
01061                         (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\''
01062                                     && str[endpos] != ';' && str[endpos] != '>') )
01063                     ++endpos;
01064     #ifdef DECODE_DEBUG
01065                 kDebug( 6005 ) << "KEncodingDetector: found charset in <meta>: " << str.mid(pos,endpos-pos).data();
01066     #endif
01067                 if (setEncoding(str.mid(pos,endpos-pos), EncodingFromMetaTag))
01068                     return true;
01069             }
01070             else if (tmp[0]=='b'&&tmp[1]=='o'&&tmp[2]=='d'&&tmp[3]=='y')
01071             {
01072                 d->m_seenBody=true;
01073                 break;
01074             }
01075         }
01076     }
01077 
01078     if (len<20)
01079         return false;
01080 
01081 #ifdef DECODE_DEBUG
01082     kDebug( 6005 ) << "KEncodingDetector: using heuristics (" << strlen(data) << ")";
01083 #endif
01084 
01085     switch ( d->m_autoDetectLanguage)
01086     {
01087         case KEncodingDetector::Arabic:
01088             return setEncoding(automaticDetectionForArabic( (const unsigned char*) data, len ), AutoDetectedEncoding);
01089 //             break;
01090         case KEncodingDetector::Baltic:
01091             return setEncoding(automaticDetectionForBaltic( (const unsigned char*) data, len ), AutoDetectedEncoding);
01092 //             break;
01093         case KEncodingDetector::CentralEuropean:
01094             return setEncoding(automaticDetectionForCentralEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding);
01095             break;
01096         case KEncodingDetector::Cyrillic:
01097             return setEncoding(automaticDetectionForCyrillic( (const unsigned char*) data, len), AutoDetectedEncoding);
01098 //             break;
01099         case KEncodingDetector::Greek:
01100             return setEncoding(automaticDetectionForGreek( (const unsigned char*) data, len ), AutoDetectedEncoding);
01101 //             break;
01102         case KEncodingDetector::Hebrew:
01103             return setEncoding(automaticDetectionForHebrew( (const unsigned char*) data, len ), AutoDetectedEncoding);
01104 //             break;
01105         case KEncodingDetector::Japanese:
01106             return setEncoding(automaticDetectionForJapanese( (const unsigned char*) data, len ), AutoDetectedEncoding);
01107 //             break;
01108         case KEncodingDetector::Turkish:
01109             return setEncoding(automaticDetectionForTurkish( (const unsigned char*) data, len ), AutoDetectedEncoding);
01110 //             break;
01111         case KEncodingDetector::WesternEuropean:
01112             if (setEncoding(automaticDetectionForWesternEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding))
01113                 return true;
01114             else if (d->m_defaultCodec->mibEnum()==MibLatin1) //detection for khtml
01115             {
01116                 return setEncoding("iso-8859-15",AutoDetectedEncoding);
01117             }
01118             else //use default provided by eg katepart
01119             {
01120                 return setEncoding("",DefaultEncoding);
01121             }
01122 //             break;
01123         case KEncodingDetector::SemiautomaticDetection:
01124         case KEncodingDetector::ChineseSimplified:
01125         case KEncodingDetector::ChineseTraditional:
01126         case KEncodingDetector::Korean:
01127         case KEncodingDetector::Thai:
01128         case KEncodingDetector::Unicode:
01129         case KEncodingDetector::NorthernSaami:
01130         case KEncodingDetector::SouthEasternEurope:
01131         case KEncodingDetector::None:
01132             // huh. somethings broken in this code ### FIXME
01133             //enc = 0; //Reset invalid codec we tried, so we get back to latin1 fallback.
01134             break;
01135     }
01136 
01137     return true;
01138 }
01139 
01140 
01141 KEncodingDetector::AutoDetectScript KEncodingDetector::scriptForName(const QString& lang)
01142 {
01143     if (lang.isEmpty())
01144         return KEncodingDetector::None;
01145     else if (lang==i18nc("@item Text character set", "Unicode"))
01146         return KEncodingDetector::Unicode;
01147     else if (lang==i18nc("@item Text character set", "Cyrillic"))
01148         return KEncodingDetector::Cyrillic;
01149     else if (lang==i18nc("@item Text character set", "Western European"))
01150         return KEncodingDetector::WesternEuropean;
01151     else if (lang==i18nc("@item Text character set", "Central European"))
01152         return KEncodingDetector::CentralEuropean;
01153     else if (lang==i18nc("@item Text character set", "Greek"))
01154         return KEncodingDetector::Greek;
01155     else if (lang==i18nc("@item Text character set", "Hebrew"))
01156         return KEncodingDetector::Hebrew;
01157     else if (lang==i18nc("@item Text character set", "Turkish"))
01158         return KEncodingDetector::Turkish;
01159     else if (lang==i18nc("@item Text character set", "Japanese"))
01160         return KEncodingDetector::Japanese;
01161     else if (lang==i18nc("@item Text character set", "Baltic"))
01162         return KEncodingDetector::Baltic;
01163     else if (lang==i18nc("@item Text character set", "Arabic"))
01164         return KEncodingDetector::Arabic;
01165 
01166     return KEncodingDetector::None;
01167 }
01168 
01169 bool KEncodingDetector::hasAutoDetectionForScript(KEncodingDetector::AutoDetectScript script)
01170 {
01171     switch (script)
01172     {
01173         case KEncodingDetector::Arabic:
01174             return true;
01175         case KEncodingDetector::Baltic:
01176             return true;
01177         case KEncodingDetector::CentralEuropean:
01178             return true;
01179         case KEncodingDetector::Cyrillic:
01180             return true;
01181         case KEncodingDetector::Greek:
01182             return true;
01183         case KEncodingDetector::Hebrew:
01184             return true;
01185         case KEncodingDetector::Japanese:
01186             return true;
01187         case KEncodingDetector::Turkish:
01188             return true;
01189         case KEncodingDetector::WesternEuropean:
01190             return true;
01191         case KEncodingDetector::ChineseTraditional:
01192             return true;
01193         case KEncodingDetector::ChineseSimplified:
01194             return true;
01195         case KEncodingDetector::Unicode:
01196             return true;
01197             break;
01198         default:
01199             return false;
01200     }
01201 }
01202 
01203 QString KEncodingDetector::nameForScript(KEncodingDetector::AutoDetectScript script)
01204 {
01205     switch (script)
01206     {
01207         case KEncodingDetector::Arabic:
01208             return i18nc("@item Text character set", "Arabic");
01209             break;
01210         case KEncodingDetector::Baltic:
01211             return i18nc("@item Text character set", "Baltic");
01212             break;
01213         case KEncodingDetector::CentralEuropean:
01214             return i18nc("@item Text character set", "Central European");
01215             break;
01216         case KEncodingDetector::Cyrillic:
01217             return i18nc("@item Text character set", "Cyrillic");
01218             break;
01219         case KEncodingDetector::Greek:
01220             return i18nc("@item Text character set", "Greek");
01221             break;
01222         case KEncodingDetector::Hebrew:
01223             return i18nc("@item Text character set", "Hebrew");
01224             break;
01225         case KEncodingDetector::Japanese:
01226             return i18nc("@item Text character set", "Japanese");
01227             break;
01228         case KEncodingDetector::Turkish:
01229             return i18nc("@item Text character set", "Turkish");
01230             break;
01231         case KEncodingDetector::WesternEuropean:
01232             return i18nc("@item Text character set", "Western European");
01233             break;
01234         case KEncodingDetector::ChineseTraditional:
01235             return i18nc("@item Text character set", "Chinese Traditional");
01236             break;
01237         case KEncodingDetector::ChineseSimplified:
01238             return i18nc("@item Text character set", "Chinese Simplified");
01239             break;
01240         case KEncodingDetector::Korean:
01241             return i18nc("@item Text character set", "Korean");
01242             break;
01243         case KEncodingDetector::Thai:
01244             return i18nc("@item Text character set", "Thai");
01245             break;
01246         case KEncodingDetector::Unicode:
01247             return i18nc("@item Text character set", "Unicode");
01248             break;
01249         //case KEncodingDetector::SemiautomaticDetection:
01250         default:
01251             return QString();
01252 
01253         }
01254 }
01255 
01256 #undef DECODE_DEBUG
01257 

KDECore

Skip menu "KDECore"
  • Main Page
  • Modules
  • Namespace List
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Namespace Members
  • Class Members
  • Related Pages

kdelibs

Skip menu "kdelibs"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • Kate
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • kio
  • KIOSlave
  • KJS
  •   KJS-API
  •   WTF
  • kjsembed
  • KNewStuff
  • KParts
  • KPty
  • Kross
  • KUtils
  • Nepomuk
  • Plasma
  • Solid
  • Sonnet
  • ThreadWeaver
Generated for kdelibs by doxygen 1.6.1
This website is maintained by Adriaan de Groot and Allen Winter.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal