View file humhub-1.1.2/protected/vendor/nqxcode/zendsearch/tests/ZendSearch/Lucene/AnalysisTest.php

File size: 14.96Kb
<?php
/**
 * Zend Framework (http://framework.zend.com/)
 *
 * @link      http://github.com/zendframework/zf2 for the canonical source repository
 * @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
 * @license   http://framework.zend.com/license/new-bsd New BSD License
 * @package   Zend_Search
 */

namespace ZendSearchTest\Lucene;

use ZendSearch\Lucene\Analysis\Analyzer\Analyzer;
use ZendSearch\Lucene\Analysis\Analyzer\AnalyzerInterface;
use ZendSearch\Lucene\Analysis\Analyzer\Common;
use ZendSearch\Lucene\Analysis\Analyzer\Common\Text;
use ZendSearch\Lucene\Analysis\Analyzer\Common\TextNum;
use ZendSearch\Lucene\Analysis\Analyzer\Common\Utf8;
use ZendSearch\Lucene\Analysis\Analyzer\Common\Utf8Num;

/**
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage UnitTests
 * @group      Zend_Search_Lucene
 */
class AnalysisTest extends \PHPUnit_Framework_TestCase
{
    public function testAnalyzer()
    {
        $currentAnalyzer = Analyzer::getDefault();
        $this->assertTrue($currentAnalyzer instanceof AnalyzerInterface);

        /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num */

        $newAnalyzer = new Common\Utf8Num();
        Analyzer::setDefault($newAnalyzer);
        $this->assertTrue(Analyzer::getDefault() === $newAnalyzer);

        // Set analyzer to the default value (used in other tests)
        Analyzer::setDefault($currentAnalyzer);
    }

    public function testText()
    {
        /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */

        $analyzer = new Common\Text();

        $tokenList = $analyzer->tokenize('Word1 Word2 anotherWord');

        $this->assertEquals(count($tokenList), 3);

        $this->assertEquals($tokenList[0]->getTermText(),         'Word');
        $this->assertEquals($tokenList[0]->getStartOffset(),       0);
        $this->assertEquals($tokenList[0]->getEndOffset(),         4);
        $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);

        $this->assertEquals($tokenList[1]->getTermText(),         'Word');
        $this->assertEquals($tokenList[1]->getStartOffset(),       6);
        $this->assertEquals($tokenList[1]->getEndOffset(),         10);
        $this->assertEquals($tokenList[1]->getPositionIncrement(), 1);

        $this->assertEquals($tokenList[2]->getTermText(),         'anotherWord');
        $this->assertEquals($tokenList[2]->getStartOffset(),       12);
        $this->assertEquals($tokenList[2]->getEndOffset(),         23);
        $this->assertEquals($tokenList[2]->getPositionIncrement(), 1);
    }

    public function testTextCaseInsensitive()
    {
        /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */

        $analyzer = new Text\CaseInsensitive();

        $tokenList = $analyzer->tokenize('Word1 Word2 anotherWord');

        $this->assertEquals(count($tokenList), 3);

        $this->assertEquals($tokenList[0]->getTermText(),         'word');
        $this->assertEquals($tokenList[0]->getStartOffset(),       0);
        $this->assertEquals($tokenList[0]->getEndOffset(),         4);
        $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);

        $this->assertEquals($tokenList[1]->getTermText(),         'word');
        $this->assertEquals($tokenList[1]->getStartOffset(),       6);
        $this->assertEquals($tokenList[1]->getEndOffset(),         10);
        $this->assertEquals($tokenList[1]->getPositionIncrement(), 1);

        $this->assertEquals($tokenList[2]->getTermText(),         'anotherword');
        $this->assertEquals($tokenList[2]->getStartOffset(),       12);
        $this->assertEquals($tokenList[2]->getEndOffset(),         23);
        $this->assertEquals($tokenList[2]->getPositionIncrement(), 1);
    }

    public function testTextNum()
    {
        /** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum */

        $analyzer = new Common\TextNum();

        $tokenList = $analyzer->tokenize('Word1 Word2 anotherWord');

        $this->assertEquals(count($tokenList), 3);

        $this->assertEquals($tokenList[0]->getTermText(),         'Word1');
        $this->assertEquals($tokenList[0]->getStartOffset(),       0);
        $this->assertEquals($tokenList[0]->getEndOffset(),         5);
        $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);

        $this->assertEquals($tokenList[1]->getTermText(),         'Word2');
        $this->assertEquals($tokenList[1]->getStartOffset(),       6);
        $this->assertEquals($tokenList[1]->getEndOffset(),         11);
        $this->assertEquals($tokenList[1]->getPositionIncrement(), 1);

        $this->assertEquals($tokenList[2]->getTermText(),         'anotherWord');
        $this->assertEquals($tokenList[2]->getStartOffset(),       12);
        $this->assertEquals($tokenList[2]->getEndOffset(),         23);
        $this->assertEquals($tokenList[2]->getPositionIncrement(), 1);
    }

    public function testTextNumCaseInsensitive()
    {
        /** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive */

        $analyzer = new TextNum\CaseInsensitive();

        $tokenList = $analyzer->tokenize('Word1 Word2 anotherWord');

        $this->assertEquals(count($tokenList), 3);

        $this->assertEquals($tokenList[0]->getTermText(),         'word1');
        $this->assertEquals($tokenList[0]->getStartOffset(),       0);
        $this->assertEquals($tokenList[0]->getEndOffset(),         5);
        $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);

        $this->assertEquals($tokenList[1]->getTermText(),         'word2');
        $this->assertEquals($tokenList[1]->getStartOffset(),       6);
        $this->assertEquals($tokenList[1]->getEndOffset(),         11);
        $this->assertEquals($tokenList[1]->getPositionIncrement(), 1);

        $this->assertEquals($tokenList[2]->getTermText(),         'anotherword');
        $this->assertEquals($tokenList[2]->getStartOffset(),       12);
        $this->assertEquals($tokenList[2]->getEndOffset(),         23);
        $this->assertEquals($tokenList[2]->getPositionIncrement(), 1);
    }

    public function testUtf8()
    {
        if (@preg_match('/\pL/u', 'a') != 1) {
            // PCRE unicode support is turned off
            return;
        }

        /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 */

        $analyzer = new Common\Utf8();

        // UTF-8 text with a cyrillic symbols
        $tokenList = $analyzer->tokenize('Слово1 Слово2 ДругоеСлово', 'UTF-8');

        $this->assertEquals(count($tokenList), 3);

        $this->assertEquals($tokenList[0]->getTermText(),         'Слово');
        $this->assertEquals($tokenList[0]->getStartOffset(),       0);
        $this->assertEquals($tokenList[0]->getEndOffset(),         5);
        $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);

        $this->assertEquals($tokenList[1]->getTermText(),         'Слово');
        $this->assertEquals($tokenList[1]->getStartOffset(),       7);
        $this->assertEquals($tokenList[1]->getEndOffset(),         12);
        $this->assertEquals($tokenList[1]->getPositionIncrement(), 1);

        $this->assertEquals($tokenList[2]->getTermText(),         'ДругоеСлово');
        $this->assertEquals($tokenList[2]->getStartOffset(),       14);
        $this->assertEquals($tokenList[2]->getEndOffset(),         25);
        $this->assertEquals($tokenList[2]->getPositionIncrement(), 1);
    }

    public function testUtf8Num()
    {
        if (@preg_match('/\pL/u', 'a') != 1) {
            // PCRE unicode support is turned off
            return;
        }

        /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num */

        $analyzer = new Common\Utf8Num();

        // UTF-8 text with a cyrillic symbols
        $tokenList = $analyzer->tokenize('Слово1 Слово2 ДругоеСлово', 'UTF-8');

        $this->assertEquals(count($tokenList), 3);

        $this->assertEquals($tokenList[0]->getTermText(),         'Слово1');
        $this->assertEquals($tokenList[0]->getStartOffset(),       0);
        $this->assertEquals($tokenList[0]->getEndOffset(),         6);
        $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);

        $this->assertEquals($tokenList[1]->getTermText(),         'Слово2');
        $this->assertEquals($tokenList[1]->getStartOffset(),       7);
        $this->assertEquals($tokenList[1]->getEndOffset(),         13);
        $this->assertEquals($tokenList[1]->getPositionIncrement(), 1);

        $this->assertEquals($tokenList[2]->getTermText(),         'ДругоеСлово');
        $this->assertEquals($tokenList[2]->getStartOffset(),       14);
        $this->assertEquals($tokenList[2]->getEndOffset(),         25);
        $this->assertEquals($tokenList[2]->getPositionIncrement(), 1);
    }

    public function testUtf8CaseInsensitive()
    {
        if (@preg_match('/\pL/u', 'a') != 1) {
            // PCRE unicode support is turned off
            return;
        }
        if (!function_exists('mb_strtolower')) {
            // mbstring extension is disabled
            return;
        }

        /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive */

        $analyzer = new Utf8\CaseInsensitive();

        // UTF-8 text with a cyrillic symbols
        $tokenList = $analyzer->tokenize('Слово1 Слово2 ДругоеСлово', 'UTF-8');

        $this->assertEquals(count($tokenList), 3);

        $this->assertEquals($tokenList[0]->getTermText(),         'слово');
        $this->assertEquals($tokenList[0]->getStartOffset(),       0);
        $this->assertEquals($tokenList[0]->getEndOffset(),         5);
        $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);

        $this->assertEquals($tokenList[1]->getTermText(),         'слово');
        $this->assertEquals($tokenList[1]->getStartOffset(),       7);
        $this->assertEquals($tokenList[1]->getEndOffset(),         12);
        $this->assertEquals($tokenList[1]->getPositionIncrement(), 1);

        $this->assertEquals($tokenList[2]->getTermText(),         'другоеслово');
        $this->assertEquals($tokenList[2]->getStartOffset(),       14);
        $this->assertEquals($tokenList[2]->getEndOffset(),         25);
        $this->assertEquals($tokenList[2]->getPositionIncrement(), 1);
    }

    public function testUtf8NumCaseInsensitive()
    {
        if (@preg_match('/\pL/u', 'a') != 1) {
            // PCRE unicode support is turned off
            return;
        }
        if (!function_exists('mb_strtolower')) {
            // mbstring extension is disabled
            return;
        }

        /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive */
        $analyzer = new Utf8Num\CaseInsensitive();

        // UTF-8 text with a cyrillic symbols
        $tokenList = $analyzer->tokenize('Слово1 Слово2 ДругоеСлово', 'UTF-8');

        $this->assertEquals(count($tokenList), 3);

        $this->assertEquals($tokenList[0]->getTermText(),         'слово1');
        $this->assertEquals($tokenList[0]->getStartOffset(),       0);
        $this->assertEquals($tokenList[0]->getEndOffset(),         6);
        $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);

        $this->assertEquals($tokenList[1]->getTermText(),         'слово2');
        $this->assertEquals($tokenList[1]->getStartOffset(),       7);
        $this->assertEquals($tokenList[1]->getEndOffset(),         13);
        $this->assertEquals($tokenList[1]->getPositionIncrement(), 1);

        $this->assertEquals($tokenList[2]->getTermText(),         'другоеслово');
        $this->assertEquals($tokenList[2]->getStartOffset(),       14);
        $this->assertEquals($tokenList[2]->getEndOffset(),         25);
        $this->assertEquals($tokenList[2]->getPositionIncrement(), 1);
    }

    public function testEncoding()
    {
        if (PHP_OS == 'AIX') {
            $this->markTestSkipped('Test not available on AIX');
        }

        /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 */

        $analyzer = new Common\Utf8();

        // UTF-8 text with a cyrillic symbols
        $tokenList = $analyzer->tokenize(iconv('UTF-8', 'Windows-1251', 'Слово1 Слово2 ДругоеСлово'), 'Windows-1251');

        $this->assertEquals(count($tokenList), 3);

        $this->assertEquals($tokenList[0]->getTermText(),         'Слово');
        $this->assertEquals($tokenList[0]->getStartOffset(),       0);
        $this->assertEquals($tokenList[0]->getEndOffset(),         5);
        $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);

        $this->assertEquals($tokenList[1]->getTermText(),         'Слово');
        $this->assertEquals($tokenList[1]->getStartOffset(),       7);
        $this->assertEquals($tokenList[1]->getEndOffset(),         12);
        $this->assertEquals($tokenList[1]->getPositionIncrement(), 1);

        $this->assertEquals($tokenList[2]->getTermText(),         'ДругоеСлово');
        $this->assertEquals($tokenList[2]->getStartOffset(),       14);
        $this->assertEquals($tokenList[2]->getEndOffset(),         25);
        $this->assertEquals($tokenList[2]->getPositionIncrement(), 1);
    }

    public function testStopWords()
    {
        /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */

        /** Zend_Search_Lucene_Analysis_TokenFilter_StopWords */

        $analyzer = new Text\CaseInsensitive();
        $stopWordsFilter = new \ZendSearch\Lucene\Analysis\TokenFilter\StopWords(array('word', 'and', 'or'));

        $analyzer->addFilter($stopWordsFilter);

        $tokenList = $analyzer->tokenize('Word1 Word2 anotherWord');

        $this->assertEquals(count($tokenList), 1);

        $this->assertEquals($tokenList[0]->getTermText(),         'anotherword');
        $this->assertEquals($tokenList[0]->getStartOffset(),       12);
        $this->assertEquals($tokenList[0]->getEndOffset(),         23);
        $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);
    }

    public function testShortWords()
    {
        /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */

        /** Zend_Search_Lucene_Analysis_TokenFilter_ShortWords */

        $analyzer = new Text\CaseInsensitive();
        $stopWordsFilter = new \ZendSearch\Lucene\Analysis\TokenFilter\ShortWords(4 /* Minimal length */);

        $analyzer->addFilter($stopWordsFilter);

        $tokenList = $analyzer->tokenize('Word1 and anotherWord');

        $this->assertEquals(count($tokenList), 2);

        $this->assertEquals($tokenList[0]->getTermText(),         'word');
        $this->assertEquals($tokenList[0]->getStartOffset(),       0);
        $this->assertEquals($tokenList[0]->getEndOffset(),         4);
        $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);

        $this->assertEquals($tokenList[1]->getTermText(),         'anotherword');
        $this->assertEquals($tokenList[1]->getStartOffset(),       10);
        $this->assertEquals($tokenList[1]->getEndOffset(),         21);
        $this->assertEquals($tokenList[1]->getPositionIncrement(), 1);
    }
}