View file humhub-1.1.2/protected/vendor/nqxcode/zendsearch/library/ZendSearch/Lucene/Index/DictionaryLoader.php

File size: 9.17Kb
<?php
/**
 * Zend Framework (http://framework.zend.com/)
 *
 * @link      http://github.com/zendframework/zf2 for the canonical source repository
 * @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
 * @license   http://framework.zend.com/license/new-bsd New BSD License
 * @package   Zend_Search
 */

namespace ZendSearch\Lucene\Index;

use ZendSearch\Lucene;
use ZendSearch\Lucene\Exception\InvalidFileFormatException;

/**
 * Dictionary loader
 *
 * It's a dummy class which is created to encapsulate non-good structured code.
 * Manual "method inlining" is performed to increase dictionary index loading operation
 * which is major bottelneck for search performance.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 */
class DictionaryLoader
{
    /**
     * Dictionary index loader.
     *
     * It takes a string which is actually <segment_name>.tii index file data and
     * returns two arrays - term and tremInfo lists.
     *
     * See Zend_Search_Lucene_Index_SegmintInfo class for details
     *
     * @param string $data
     * @return array
     * @throws \ZendSearch\Lucene\Exception\InvalidFileFormatException
     */
    public static function load($data)
    {
        $termDictionary = array();
        $termInfos      = array();
        $pos = 0;

        // $tiVersion = $tiiFile->readInt();
        $tiVersion = ord($data[0]) << 24 | ord($data[1]) << 16 | ord($data[2]) << 8  | ord($data[3]);
        $pos += 4;
        if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */ &&
            $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
                throw new InvalidFileFormatException('Wrong TermInfoIndexFile file format');
        }

        // $indexTermCount = $tiiFile->readLong();
        if (PHP_INT_SIZE > 4) {
            $indexTermCount = ord($data[$pos]) << 56  |
                              ord($data[$pos+1]) << 48  |
                              ord($data[$pos+2]) << 40  |
                              ord($data[$pos+3]) << 32  |
                              ord($data[$pos+4]) << 24  |
                              ord($data[$pos+5]) << 16  |
                              ord($data[$pos+6]) << 8   |
                              ord($data[$pos+7]);
        } else {
            if ((ord($data[$pos])            != 0) ||
                (ord($data[$pos+1])          != 0) ||
                (ord($data[$pos+2])          != 0) ||
                (ord($data[$pos+3])          != 0) ||
                ((ord($data[$pos+4]) & 0x80) != 0)) {
                    throw new InvalidFileFormatException('Largest supported segment size (for 32-bit mode) is 2Gb');
                 }

            $indexTermCount = ord($data[$pos+4]) << 24  |
                              ord($data[$pos+5]) << 16  |
                              ord($data[$pos+6]) << 8   |
                              ord($data[$pos+7]);
        }
        $pos += 8;

        //                  $tiiFile->readInt();  // IndexInterval
        $pos += 4;

        // $skipInterval   = $tiiFile->readInt();
        $skipInterval = ord($data[$pos]) << 24 | ord($data[$pos+1]) << 16 | ord($data[$pos+2]) << 8  | ord($data[$pos+3]);
        $pos += 4;
        if ($indexTermCount < 1) {
            throw new InvalidFileFormatException('Wrong number of terms in a term dictionary index');
        }

        if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
            /* Skip MaxSkipLevels value */
            $pos += 4;
        }

        $prevTerm     = '';
        $freqPointer  =  0;
        $proxPointer  =  0;
        $indexPointer =  0;
        for ($count = 0; $count < $indexTermCount; $count++) {
            //$termPrefixLength = $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $termPrefixLength = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $termPrefixLength |= ($nbyte & 0x7F) << $shift;
            }

            // $termSuffix       = $tiiFile->readString();
            $nbyte = ord($data[$pos++]);
            $len = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $len |= ($nbyte & 0x7F) << $shift;
            }
            if ($len == 0) {
                $termSuffix = '';
            } else {
                $termSuffix = substr($data, $pos, $len);
                $pos += $len;
                for ($count1 = 0; $count1 < $len; $count1++ ) {
                    if (( ord($termSuffix[$count1]) & 0xC0 ) == 0xC0) {
                        $addBytes = 1;
                        if (ord($termSuffix[$count1]) & 0x20 ) {
                            $addBytes++;

                            // Never used for Java Lucene created index.
                            // Java2 doesn't encode strings in four bytes
                            if (ord($termSuffix[$count1]) & 0x10 ) {
                                $addBytes++;
                            }
                        }
                        $termSuffix .= substr($data, $pos, $addBytes);
                        $pos += $addBytes;
                        $len += $addBytes;

                        // Check for null character. Java2 encodes null character
                        // in two bytes.
                        if (ord($termSuffix[$count1]) == 0xC0 &&
                            ord($termSuffix[$count1+1]) == 0x80   ) {
                            $termSuffix[$count1] = 0;
                            $termSuffix = substr($termSuffix,0,$count1+1)
                                        . substr($termSuffix,$count1+2);
                        }
                        $count1 += $addBytes;
                    }
                }
            }

            $pb = 0; $pc = 0;
            while ($pb < strlen($prevTerm)  &&  $pc < $termPrefixLength) {
                $charBytes = 1;
                if ((ord($prevTerm[$pb]) & 0xC0) == 0xC0) {
                    $charBytes++;
                    if (ord($prevTerm[$pb]) & 0x20 ) {
                        $charBytes++;
                        if (ord($prevTerm[$pb]) & 0x10 ) {
                            $charBytes++;
                        }
                    }
                }

                if ($pb + $charBytes > strlen($data)) {
                    // wrong character
                    break;
                }

                $pc++;
                $pb += $charBytes;
            }
            $termValue = substr($prevTerm, 0, $pb) . $termSuffix;

            // $termFieldNum     = $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $termFieldNum = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $termFieldNum |= ($nbyte & 0x7F) << $shift;
            }

            // $docFreq          = $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $docFreq = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $docFreq |= ($nbyte & 0x7F) << $shift;
            }

            // $freqPointer     += $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $vint = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $vint |= ($nbyte & 0x7F) << $shift;
            }
            $freqPointer += $vint;

            // $proxPointer     += $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $vint = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $vint |= ($nbyte & 0x7F) << $shift;
            }
            $proxPointer += $vint;

            if( $docFreq >= $skipInterval ) {
                // $skipDelta = $tiiFile->readVInt();
                $nbyte = ord($data[$pos++]);
                $vint = $nbyte & 0x7F;
                for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                    $nbyte = ord($data[$pos++]);
                    $vint |= ($nbyte & 0x7F) << $shift;
                }
                $skipDelta = $vint;
            } else {
                $skipDelta = 0;
            }

            // $indexPointer += $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $vint = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $vint |= ($nbyte & 0x7F) << $shift;
            }
            $indexPointer += $vint;


            $termDictionary[] = array($termFieldNum, $termValue);

            $termInfos[] =
                 array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);

            $prevTerm = $termValue;
        }

        // Check special index entry mark
        if ($termDictionary[0][0] != (int)0xFFFFFFFF) {
            throw new InvalidFileFormatException('Wrong TermInfoIndexFile file format');
        }

        if (PHP_INT_SIZE > 4) {
            // Treat 64-bit 0xFFFFFFFF as -1
            $termDictionary[0][0] = -1;
        }

        return array($termDictionary, $termInfos);
    }
}