Current File : /home/k/a/r/karenpetzb/www/items/category/Index.zip |
PK =hH[��T T Term.phpnu &1i� <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
* A Term represents a word from text. This is the unit of search. It is
* composed of two elements, the text of the word, as a string, and the name of
* the field that the text occurred in, an interned string.
*
* Note that terms may represent more than words from text fields, but also
* things like dates, email addresses, urls, etc.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Index_Term
{
    /**
     * Field name or field number (depending from context)
     *
     * @var mixed
     */
    public $field;

    /**
     * Term value
     *
     * @var string
     */
    public $text;

    /**
     * Object constructor
     *
     * @param string     $text  Term value
     * @param mixed|null $field Field name or number. When null, the value returned by
     *                          Zend_Search_Lucene::getDefaultSearchField() is used.
     */
    public function __construct($text, $field = null)
    {
        $this->field = ($field === null)? Zend_Search_Lucene::getDefaultSearchField() : $field;
        $this->text  = $text;
    }

    /**
     * Returns term key
     *
     * The key is the field and the text joined by a NUL byte.
     *
     * @return string
     */
    public function key()
    {
        return $this->field . chr(0) . $this->text;
    }

    /**
     * Returns the number of bytes announced by the lead byte of a UTF-8 sequence.
     *
     * Bytes whose two top bits are not both set are counted as single-byte
     * characters; otherwise bits 0x20 and 0x10 select a 2, 3 or 4 byte sequence.
     * Continuation bytes are not validated.
     *
     * @param string $leadByte First byte of the character
     * @return integer
     */
    private static function _charBytes($leadByte)
    {
        $code = ord($leadByte);

        if (($code & 0xC0) != 0xC0) {
            return 1;
        }
        if (!($code & 0x20)) {
            return 2;
        }

        return ($code & 0x10) ? 4 : 3;
    }

    /**
     * Get term prefix
     *
     * Returns the first $length UTF-8 characters of $str. Scanning stops early
     * at a character whose announced byte length runs past the end of the string.
     *
     * @param string $str
     * @param integer $length Prefix length in characters
     * @return string
     */
    public static function getPrefix($str, $length)
    {
        $totalBytes  = strlen($str);
        $prefixBytes = 0;
        $prefixChars = 0;

        while ($prefixBytes < $totalBytes && $prefixChars < $length) {
            $charBytes = self::_charBytes($str[$prefixBytes]);

            if ($prefixBytes + $charBytes > $totalBytes) {
                // wrong character
                break;
            }

            $prefixChars++;
            $prefixBytes += $charBytes;
        }

        return substr($str, 0, $prefixBytes);
    }

    /**
     * Get UTF-8 string length
     *
     * Counts characters, not bytes. A trailing character whose announced byte
     * length runs past the end of the string is not counted.
     *
     * @param string $str
     * @return integer
     */
    public static function getLength($str)
    {
        $totalBytes = strlen($str);
        $bytes = 0;
        $chars = 0;

        while ($bytes < $totalBytes) {
            $charBytes = self::_charBytes($str[$bytes]);

            if ($bytes + $charBytes > $totalBytes) {
                // wrong character
                break;
            }

            $chars++;
            $bytes += $charBytes;
        }

        return $chars;
    }
}
PK =hH[χ�m� � SegmentInfo.phpnu &1i� <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Index_DictionaryLoader */
require_once 'Zend/Search/Lucene/Index/DictionaryLoader.php';
/** Zend_Search_Lucene_Exception */
require_once 'Zend/Search/Lucene/Exception.php';
/** Zend_Search_Lucene_LockManager */
require_once 'Zend/Search/Lucene/LockManager.php';
/** Zend_Search_Lucene_Index_DocsFilter */
require_once 'Zend/Search/Lucene/Index/DocsFilter.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Index_SegmentInfo
{
/**
* "Full scan vs fetch" boundary.
*
* If filter selectivity is less than this value, then full scan is performed
* (since term entries fetching has some additional overhead).
*/
const FULL_SCAN_VS_FETCH_BOUNDARY = 5;
/**
* Number of docs in a segment
*
* @var integer
*/
private $_docCount;
/**
* Segment name
*
* @var string
*/
private $_name;
/**
* Term Dictionary Index
*
* Array of arrays (Zend_Search_Lucene_Index_Term objects are represented as arrays because
* of performance considerations)
* [0] -> $termFieldNum
* [1] -> $termValue
*
* Corresponding Zend_Search_Lucene_Index_TermInfo object stored in the $_termDictionaryInfos
*
* @var array
*/
private $_termDictionary;
/**
* Term Dictionary Index TermInfos
*
* Array of arrays (Zend_Search_Lucene_Index_TermInfo objects are represented as arrays because
* of performance considerations)
* [0] -> $docFreq
* [1] -> $freqPointer
* [2] -> $proxPointer
* [3] -> $skipOffset
* [4] -> $indexPointer
*
* @var array
*/
private $_termDictionaryInfos;
/**
* Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment
*
* @var array
*/
private $_fields;
/**
* Field positions in a dictionary.
* (Term dictionary contains fields ordered by names)
*
* @var array
*/
private $_fieldsDicPositions;
/**
* Associative array where the key is the file name and the value is data offset
* in a compound segment file (.cfs).
*
* @var array
*/
private $_segFiles;
/**
* Associative array where the key is the file name and the value is file size (.cfs).
*
* @var array
*/
private $_segFileSizes;
/**
* Delete file generation number
*
* -2 means autodetect latest delete generation
* -1 means 'there is no delete file'
* 0 means pre-2.1 format delete file
* X specifies used delete file
*
* @var integer
*/
private $_delGen;
/**
* Segment has single norms file
*
* If true then one .nrm file is used for all fields
* Otherwise .fN files are used
*
* @var boolean
*/
private $_hasSingleNormFile;
/**
* Use compound segment file (*.cfs) to collect all other segment files
* (excluding .del files)
*
* @var boolean
*/
private $_isCompound;
/**
* File system adapter.
*
* @var Zend_Search_Lucene_Storage_Directory_Filesystem
*/
private $_directory;
/**
* Normalization factors.
* An array fieldName => normVector
* normVector is a binary string.
* Each byte corresponds to an indexed document in a segment and
* encodes normalization factor (float value, encoded by
* Zend_Search_Lucene_Search_Similarity::encodeNorm())
*
* @var array
*/
private $_norms = array();
/**
* List of deleted documents.
* bitset if bitset extension is loaded or array otherwise.
*
* @var mixed
*/
private $_deleted = null;
/**
* $this->_deleted update flag
*
* @var boolean
*/
private $_deletedDirty = false;
/**
* True if segment uses shared doc store
*
* @var boolean
*/
private $_usesSharedDocStore;
/*
* Shared doc store options.
* It's an associative array with the following items:
* - 'offset' => $docStoreOffset The starting document in the shared doc store files where this segment's documents begin
* - 'segment' => $docStoreSegment The name of the segment that has the shared doc store files.
* - 'isCompound' => $docStoreIsCompoundFile True, if compound file format is used for the shared doc store files (.cfx file).
*/
private $_sharedDocStoreOptions;
/**
* Zend_Search_Lucene_Index_SegmentInfo constructor
*
* Besides storing its arguments, the constructor eagerly loads:
* - the file table of the shared doc store compound file ('.cfx'), when
*   $docStoreOptions['isCompound'] is set;
* - the file table of the segment compound file ('.cfs'), when the segment is compound;
* - the field infos from the '.fnm' file;
* - the list of deleted documents from the delete file selected by $delGen.
*
* @param Zend_Search_Lucene_Storage_Directory $directory
* @param string $name
* @param integer $docCount
* @param integer $delGen -2: autodetect, -1: no delete file, 0: pre-2.1 delete file, other: generation to load
* @param array|null $docStoreOptions Keys read here: 'isCompound' and 'segment'
* @param boolean $hasSingleNormFile
* @param boolean $isCompound null means "unknown"; it is then detected by trying to open the '.cfs' file
* @throws Zend_Search_Lucene_Exception Rethrown when probing the '.cfs' file fails for a reason
*                                      other than 'is not readable'
*/
public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name, $docCount, $delGen = 0, $docStoreOptions = null, $hasSingleNormFile = false, $isCompound = null)
{
$this->_directory = $directory;
$this->_name = $name;
$this->_docCount = $docCount;
// Note: $_usesSharedDocStore is only ever assigned true here; without doc store
// options it keeps its declared (unset) value.
if ($docStoreOptions !== null) {
$this->_usesSharedDocStore = true;
$this->_sharedDocStoreOptions = $docStoreOptions;
if ($docStoreOptions['isCompound']) {
// Read the '.cfx' file table: a VInt entry count followed by
// (long data offset, string file name) pairs.
$cfxFile = $this->_directory->getFileObject($docStoreOptions['segment'] . '.cfx');
$cfxFilesCount = $cfxFile->readVInt();
$cfxFiles = array();
$cfxFileSizes = array();
for ($count = 0; $count < $cfxFilesCount; $count++) {
$dataOffset = $cfxFile->readLong();
if ($count != 0) {
// $fileName still holds the name read in the previous iteration:
// its size is the distance between this offset and its own offset.
$cfxFileSizes[$fileName] = $dataOffset - end($cfxFiles);
}
$fileName = $cfxFile->readString();
$cfxFiles[$fileName] = $dataOffset;
}
if ($count != 0) {
// The last file extends to the end of the compound file
$cfxFileSizes[$fileName] = $this->_directory->fileLength($docStoreOptions['segment'] . '.cfx') - $dataOffset;
}
$this->_sharedDocStoreOptions['files'] = $cfxFiles;
$this->_sharedDocStoreOptions['fileSizes'] = $cfxFileSizes;
}
}
$this->_hasSingleNormFile = $hasSingleNormFile;
$this->_delGen = $delGen;
// Term dictionary index is loaded lazily (null means "not loaded yet")
$this->_termDictionary = null;
if ($isCompound !== null) {
$this->_isCompound = $isCompound;
} else {
// It's a pre-2.1 segment or isCompound is set to 'unknown'
// Detect if segment uses compound file
try {
// Try to open compound file
$this->_directory->getFileObject($name . '.cfs');
// Compound file is found
$this->_isCompound = true;
} catch (Zend_Search_Lucene_Exception $e) {
// Detection relies on the exception message text
if (strpos($e->getMessage(), 'is not readable') !== false) {
// Compound file is not found or is not readable
$this->_isCompound = false;
} else {
throw $e;
}
}
}
$this->_segFiles = array();
if ($this->_isCompound) {
// Read the '.cfs' file table; same layout and size computation as for '.cfx' above.
// $_segFileSizes is not initialised explicitly; it is created by the first assignment below.
$cfsFile = $this->_directory->getFileObject($name . '.cfs');
$segFilesCount = $cfsFile->readVInt();
for ($count = 0; $count < $segFilesCount; $count++) {
$dataOffset = $cfsFile->readLong();
if ($count != 0) {
// Size of the previously read file
$this->_segFileSizes[$fileName] = $dataOffset - end($this->_segFiles);
}
$fileName = $cfsFile->readString();
$this->_segFiles[$fileName] = $dataOffset;
}
if ($count != 0) {
// The last file extends to the end of the compound file
$this->_segFileSizes[$fileName] = $this->_directory->fileLength($name . '.cfs') - $dataOffset;
}
}
// Load field infos: a VInt field count followed by (string name, byte flags) pairs.
// The field number is the position of the field within the '.fnm' file.
$fnmFile = $this->openCompoundFile('.fnm');
$fieldsCount = $fnmFile->readVInt();
$fieldNames = array();
$fieldNums = array();
$this->_fields = array();
for ($count=0; $count < $fieldsCount; $count++) {
$fieldName = $fnmFile->readString();
$fieldBits = $fnmFile->readByte();
$this->_fields[$count] = new Zend_Search_Lucene_Index_FieldInfo($fieldName,
$fieldBits & 0x01 /* field is indexed */,
$count,
$fieldBits & 0x02 /* termvectors are stored */,
$fieldBits & 0x10 /* norms are omitted */,
$fieldBits & 0x20 /* payloads are stored */);
if ($fieldBits & 0x10) {
// norms are omitted for the indexed field
// Synthesize a norm vector holding the encoded norm 1.0 for each of the $docCount documents
$this->_norms[$count] = str_repeat(chr(Zend_Search_Lucene_Search_Similarity::encodeNorm(1.0)), $docCount);
}
$fieldNums[$count] = $count;
$fieldNames[$count] = $fieldName;
}
// Sort field numbers by field name, then flip so that the table maps
// field number => position in the name-ordered list.
array_multisort($fieldNames, SORT_ASC, SORT_REGULAR, $fieldNums);
$this->_fieldsDicPositions = array_flip($fieldNums);
if ($this->_delGen == -2) {
// Autodetect: _detectLatestDelGen() is defined outside the visible part of this
// file; presumably it updates $this->_delGen - TODO confirm.
$this->_detectLatestDelGen();
}
if ($this->_delGen == -1) {
// There is no delete file for this segment
// Do nothing
} else if ($this->_delGen == 0) {
// It's a segment with pre-2.1 format delete file
// Try to find delete file
try {
// '.del' files always stored in a separate file
// Segment compound is not used
$delFile = $this->_directory->getFileObject($this->_name . '.del');
// First int is divided by 8 and rounded up to get the number of bytes to read
$byteCount = $delFile->readInt();
$byteCount = ceil($byteCount/8);
$bitCount = $delFile->readInt();
if ($bitCount == 0) {
$delBytes = '';
} else {
$delBytes = $delFile->readBytes($byteCount);
}
if (extension_loaded('bitset')) {
// The raw byte string is used as the bitset
$this->_deleted = $delBytes;
} else {
// Expand the bit vector into an array keyed by deleted document number
$this->_deleted = array();
for ($count = 0; $count < $byteCount; $count++) {
$byte = ord($delBytes[$count]);
for ($bit = 0; $bit < 8; $bit++) {
if ($byte & (1<<$bit)) {
$this->_deleted[$count*8 + $bit] = 1;
}
}
}
}
// NOTE(review): this catches Zend_Search_Exception while the '.cfs' probe above
// catches Zend_Search_Lucene_Exception - confirm the class relationship.
} catch(Zend_Search_Exception $e) {
if (strpos($e->getMessage(), 'is not readable') === false ) {
throw $e;
}
// There is no delete file
// Do nothing
}
} else {
// It's 2.1+ format delete file
// File name is <segment>_<generation in base 36>.del
$delFile = $this->_directory->getFileObject($this->_name . '_' . base_convert($this->_delGen, 10, 36) . '.del');
$format = $delFile->readInt();
if ($format == (int)0xFFFFFFFF) {
// Marker value: the file stores only non-zero bytes as (VInt gap, byte) pairs
if (extension_loaded('bitset')) {
$this->_deleted = bitset_empty();
} else {
$this->_deleted = array();
}
$byteCount = $delFile->readInt();
$bitCount = $delFile->readInt();
$delFileSize = $this->_directory->fileLength($this->_name . '_' . base_convert($this->_delGen, 10, 36) . '.del');
$byteNum = 0;
// The loop body runs at least once, so at least one pair is read from the file
do {
$dgap = $delFile->readVInt();
$nonZeroByte = $delFile->readByte();
// The gap is relative to the previously read byte number
$byteNum += $dgap;
for ($bit = 0; $bit < 8; $bit++) {
if ($nonZeroByte & (1<<$bit)) {
if (extension_loaded('bitset')) {
bitset_incl($this->_deleted, $byteNum*8 + $bit);
} else {
$this->_deleted[$byteNum*8 + $bit] = 1;
}
}
}
} while ($delFile->tell() < $delFileSize);
} else {
// $format is actually byte count
$byteCount = ceil($format/8);
$bitCount = $delFile->readInt();
if ($bitCount == 0) {
$delBytes = '';
} else {
$delBytes = $delFile->readBytes($byteCount);
}
if (extension_loaded('bitset')) {
// The raw byte string is used as the bitset
$this->_deleted = $delBytes;
} else {
// Expand the bit vector into an array keyed by deleted document number
$this->_deleted = array();
for ($count = 0; $count < $byteCount; $count++) {
$byte = ord($delBytes[$count]);
for ($bit = 0; $bit < 8; $bit++) {
if ($byte & (1<<$bit)) {
$this->_deleted[$count*8 + $bit] = 1;
}
}
}
}
}
}
}
/**
 * Opens index file stored within compound index file
 *
 * The returned file object is positioned at the start of the requested data:
 * - for '.fdx'/'.fdt' of a segment using a shared doc store, at this segment's
 *   section of the shared file (plain file or '.cfx' compound file);
 * - for a non-compound segment, wherever getFileObject() leaves the plain file;
 * - for a compound segment, at the file's data offset inside the '.cfs' file.
 *
 * @param string $extension File extension including the leading dot (e.g. '.fnm')
 * @param boolean $shareHandler Passed through to the directory's getFileObject()
 * @throws Zend_Search_Lucene_Exception If the compound file doesn't contain the requested file
 * @return Zend_Search_Lucene_Storage_File
 */
public function openCompoundFile($extension, $shareHandler = true)
{
    if (($extension == '.fdx' || $extension == '.fdt') && $this->_usesSharedDocStore) {
        $fdxFName = $this->_sharedDocStoreOptions['segment'] . '.fdx';
        $fdtFName = $this->_sharedDocStoreOptions['segment'] . '.fdt';

        if (!$this->_sharedDocStoreOptions['isCompound']) {
            // Shared doc store kept in plain '.fdx'/'.fdt' files.
            // Skip 8 bytes per document to reach this segment's first '.fdx' entry.
            $fdxFile = $this->_directory->getFileObject($fdxFName, $shareHandler);
            $fdxFile->seek($this->_sharedDocStoreOptions['offset']*8, SEEK_CUR);

            if ($extension == '.fdx') {
                // '.fdx' file is requested
                return $fdxFile;
            } else {
                // '.fdt' file is requested
                // The '.fdx' entry holds the start of this segment's data in '.fdt'
                $fdtStartOffset = $fdxFile->readLong();

                $fdtFile = $this->_directory->getFileObject($fdtFName, $shareHandler);
                $fdtFile->seek($fdtStartOffset, SEEK_CUR);

                return $fdtFile;
            }
        }

        if( !isset($this->_sharedDocStoreOptions['files'][$fdxFName]) ) {
            throw new Zend_Search_Lucene_Exception('Shared doc storage segment compound file doesn\'t contain '
                                                 . $fdxFName . ' file.' );
        }
        if( !isset($this->_sharedDocStoreOptions['files'][$fdtFName]) ) {
            throw new Zend_Search_Lucene_Exception('Shared doc storage segment compound file doesn\'t contain '
                                                 . $fdtFName . ' file.' );
        }

        // Open shared docstore segment file
        $cfxFile = $this->_directory->getFileObject($this->_sharedDocStoreOptions['segment'] . '.cfx', $shareHandler);
        // Seek to the start of '.fdx' file within compound file
        $cfxFile->seek($this->_sharedDocStoreOptions['files'][$fdxFName]);
        // Seek to the start of current segment documents section
        $cfxFile->seek($this->_sharedDocStoreOptions['offset']*8, SEEK_CUR);

        if ($extension == '.fdx') {
            // '.fdx' file is requested
            return $cfxFile;
        } else {
            // '.fdt' file is requested
            $fdtStartOffset = $cfxFile->readLong();

            // Seek to the start of '.fdt' file within compound file
            $cfxFile->seek($this->_sharedDocStoreOptions['files'][$fdtFName]);
            // Seek to the start of current segment documents section
            $cfxFile->seek($fdtStartOffset, SEEK_CUR);

            // The '.fdt' data lives inside the compound file, so the positioned
            // compound file object is the result ($fdtFile is not defined in this branch).
            return $cfxFile;
        }
    }

    $filename = $this->_name . $extension;

    if (!$this->_isCompound) {
        return $this->_directory->getFileObject($filename, $shareHandler);
    }

    if( !isset($this->_segFiles[$filename]) ) {
        throw new Zend_Search_Lucene_Exception('Segment compound file doesn\'t contain '
                                             . $filename . ' file.' );
    }

    $file = $this->_directory->getFileObject($this->_name . '.cfs', $shareHandler);
    $file->seek($this->_segFiles[$filename]);

    return $file;
}
/**
 * Get compound file length
 *
 * Resolution order:
 * 1. '.fdx'/'.fdt' of a segment using a shared doc store: length of the shared
 *    file, taken from the directory or from the '.cfx' size table;
 * 2. a standalone file named <segment><extension>, if the directory has one;
 * 3. the size recorded for that file in the segment compound file table.
 *
 * @param string $extension File extension including the leading dot
 * @throws Zend_Search_Lucene_Exception If the file is in none of these places
 * @return integer
 */
public function compoundFileLength($extension)
{
    $isDocStoreFile = ($extension == '.fdx' || $extension == '.fdt');

    if ($isDocStoreFile && $this->_usesSharedDocStore) {
        $sharedName = $this->_sharedDocStoreOptions['segment'] . $extension;

        if (!$this->_sharedDocStoreOptions['isCompound']) {
            return $this->_directory->fileLength($sharedName);
        }

        if (isset($this->_sharedDocStoreOptions['fileSizes'][$sharedName])) {
            return $this->_sharedDocStoreOptions['fileSizes'][$sharedName];
        }

        throw new Zend_Search_Lucene_Exception('Shared doc store compound file doesn\'t contain '
                                             . $sharedName . ' file.' );
    }

    $segmentFile = $this->_name . $extension;

    // A standalone file takes precedence over the compound file entry
    if ($this->_directory->fileExists($segmentFile)) {
        return $this->_directory->fileLength($segmentFile);
    }

    if (isset($this->_segFileSizes[$segmentFile])) {
        return $this->_segFileSizes[$segmentFile];
    }

    throw new Zend_Search_Lucene_Exception('Index compound file doesn\'t contain '
                                         . $segmentFile . ' file.' );
}
/**
 * Looks up a field by name.
 *
 * @param string $fieldName
 * @return integer Number of the first field whose name equals $fieldName, or -1 if there is none
 */
public function getFieldNum($fieldName)
{
    $fieldNum = -1;

    foreach ($this->_fields as $fieldInfo) {
        if ($fieldInfo->name == $fieldName) {
            $fieldNum = $fieldInfo->number;
            break;
        }
    }

    return $fieldNum;
}
/**
 * Returns the field info stored under the given field number.
 *
 * @param integer $fieldNum
 * @return Zend_Search_Lucene_Index_FieldInfo
 */
public function getField($fieldNum)
{
    $fieldInfo = $this->_fields[$fieldNum];

    return $fieldInfo;
}
/**
 * Returns the segment's field names as an array keyed by field name.
 * If $indexed is true, only indexed fields are included.
 *
 * @param boolean $indexed
 * @return array fieldName => fieldName
 */
public function getFields($indexed = false)
{
    $names = array();

    foreach ($this->_fields as $fieldInfo) {
        // Skip non-indexed fields when only indexed ones are requested
        if ($indexed && !$fieldInfo->isIndexed) {
            continue;
        }

        $names[$fieldInfo->name] = $fieldInfo->name;
    }

    return $names;
}
/**
 * Returns all FieldInfo objects of this segment, indexed by field number.
 *
 * @return array
 */
public function getFieldInfos()
{
    $fieldInfos = $this->_fields;

    return $fieldInfos;
}
/**
 * Returns the deletions file generation number currently held by this segment.
 *
 * @return integer
 */
public function getDelGen()
{
    $generation = $this->_delGen;

    return $generation;
}
/**
 * Returns the document count of this segment, deleted documents included.
 *
 * @return integer
 */
public function count()
{
    $totalDocs = $this->_docCount;

    return $totalDocs;
}
/**
 * Counts the documents marked as deleted.
 *
 * @return integer 0 when no deletions list is loaded
 */
private function _deletedCount()
{
    if ($this->_deleted === null) {
        return 0;
    }

    // With the bitset extension the list is a bitset and must be expanded first
    $deletedDocs = extension_loaded('bitset')
                 ? bitset_to_array($this->_deleted)
                 : $this->_deleted;

    return count($deletedDocs);
}
/**
 * Returns the number of documents in this segment that are not deleted.
 *
 * @return integer
 */
public function numDocs()
{
    $liveDocs = $this->_docCount;

    if ($this->hasDeletions()) {
        $liveDocs = $this->_docCount - $this->_deletedCount();
    }

    return $liveDocs;
}
/**
 * Translates a field number into its position in the fields dictionary.
 *
 * @param integer $fieldNum
 * @return integer Mapped position, or $fieldNum itself when it has no entry in the table
 */
private function _getFieldPosition($fieldNum) {
    if (isset($this->_fieldsDicPositions[$fieldNum])) {
        return $this->_fieldsDicPositions[$fieldNum];
    }

    // Values missing from the translation table are used as they are
    return $fieldNum;
}
/**
 * Returns the name this segment was constructed with.
 *
 * @return string
 */
public function getName()
{
    $segmentName = $this->_name;

    return $segmentName;
}
/**
* TermInfo cache
*
* Size is 1024.
* Numbers are used instead of class constants because of performance considerations
*
* @var array
*/
private $_termInfoCache = array();
/**
 * Trims the TermInfo cache.
 *
 * Entries are dropped in array order, oldest position first, until exactly
 * 768 remain. getTermInfo() re-appends entries it hits, so the front of the
 * array holds the least recently used ones.
 */
private function _cleanUpTermInfoCache()
{
    // Iterate over a snapshot of the keys; the cache itself shrinks while we go
    foreach (array_keys($this->_termInfoCache) as $staleKey) {
        unset($this->_termInfoCache[$staleKey]);

        // leave 768 last used term infos
        if (count($this->_termInfoCache) == 768) {
            break;
        }
    }
}
/**
 * Load terms dictionary index
 *
 * Fills $_termDictionary and $_termDictionaryInfos. A serialized copy in
 * '<segment>.sti' is used when it exists and holds the expected pair of arrays.
 * Otherwise the index is built from the '.tii' file and the '.sti' file is (re)written.
 *
 * @throws Zend_Search_Lucene_Exception
 */
private function _loadDictionaryIndex()
{
    $stiFileName = $this->_name . '.sti';

    // Check, if index is already serialized
    if ($this->_directory->fileExists($stiFileName)) {
        // Load serialized dictionary index data
        $stiFile     = $this->_directory->getFileObject($stiFileName);
        $stiFileData = $stiFile->readBytes($this->_directory->fileLength($stiFileName));

        // NOTE(review): unserialize() is applied to a file from the index directory.
        // That is only safe while the directory is not writable by untrusted parties.
        $unserializedData = @unserialize($stiFileData);

        // Accept the cached copy only if it has the shape written below: a pair of arrays.
        // A damaged file that unserializes to something else must not leave the
        // dictionary half-initialised; fall through and rebuild it instead.
        if (is_array($unserializedData)
            && isset($unserializedData[0], $unserializedData[1])
            && is_array($unserializedData[0])
            && is_array($unserializedData[1])) {
            list($this->_termDictionary, $this->_termDictionaryInfos) = $unserializedData;
            return;
        }
    }

    // Load data from .tii file and generate .sti file

    // Prefetch dictionary index data
    $tiiFile     = $this->openCompoundFile('.tii');
    $tiiFileData = $tiiFile->readBytes($this->compoundFileLength('.tii'));

    // Load dictionary index data
    list($this->_termDictionary, $this->_termDictionaryInfos) =
                Zend_Search_Lucene_Index_DictionaryLoader::load($tiiFileData);

    $stiFileData = serialize(array($this->_termDictionary, $this->_termDictionaryInfos));
    $stiFile     = $this->_directory->createFile($stiFileName);
    $stiFile->writeBytes($stiFileData);
}
/**
 * Scans terms dictionary and returns term info
 *
 * Lookup order:
 * 1. the TermInfo cache;
 * 2. a binary search in the in-memory dictionary index (loaded on first use);
 * 3. a sequential scan of the '.tis' file, starting from the closest preceding
 *    index entry.
 *
 * Results of steps 2 and 3 are cached, "not found" (null) results of step 3 included.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @return Zend_Search_Lucene_Index_TermInfo|null null if the term is not in this segment
 * @throws Zend_Search_Lucene_Exception If the '.tis' file has an unknown format version
 */
public function getTermInfo(Zend_Search_Lucene_Index_Term $term)
{
    $termKey = $term->key();

    // array_key_exists() rather than isset(): cached "not found" results are stored
    // as null and would otherwise never be served from the cache.
    if (array_key_exists($termKey, $this->_termInfoCache)) {
        $termInfo = $this->_termInfoCache[$termKey];

        // Move termInfo to the end of cache
        unset($this->_termInfoCache[$termKey]);
        $this->_termInfoCache[$termKey] = $termInfo;

        return $termInfo;
    }

    if ($this->_termDictionary === null) {
        $this->_loadDictionaryIndex();
    }

    $searchField = $this->getFieldNum($term->field);

    if ($searchField == -1) {
        return null;
    }
    $searchDicField = $this->_getFieldPosition($searchField);

    // search for appropriate value in dictionary
    $lowIndex  = 0;
    $highIndex = count($this->_termDictionary)-1;
    while ($highIndex >= $lowIndex) {
        $mid     = ($highIndex + $lowIndex) >> 1;
        $midTerm = $this->_termDictionary[$mid];

        // Terms are ordered by field dictionary position first, then by text
        $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */);
        $delta    = $searchDicField - $fieldNum;
        if ($delta == 0) {
            $delta = strcmp($term->text, $midTerm[1] /* text */);
        }

        if ($delta < 0) {
            $highIndex = $mid-1;
        } elseif ($delta > 0) {
            $lowIndex  = $mid+1;
        } else {
            // We got it!
            $a        = $this->_termDictionaryInfos[$mid];
            $termInfo = new Zend_Search_Lucene_Index_TermInfo($a[0], $a[1], $a[2], $a[3], $a[4]);

            // Put loaded termInfo into cache and keep the cache bounded on this path too
            $this->_termInfoCache[$termKey] = $termInfo;
            if (count($this->_termInfoCache) >= 1024) {
                $this->_cleanUpTermInfoCache();
            }

            return $termInfo;
        }
    }

    if ($highIndex == -1) {
        // Term is out of the dictionary range
        return null;
    }

    // $highIndex now points at the last index entry that sorts before the term
    $prevPosition = $highIndex;
    $prevTerm     = $this->_termDictionary[$prevPosition];
    $prevTermInfo = $this->_termDictionaryInfos[$prevPosition];

    $tisFile   = $this->openCompoundFile('.tis');
    $tiVersion = $tisFile->readInt();
    if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */ &&
        $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
        throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
    }

    $termCount     = $tisFile->readLong();
    $indexInterval = $tisFile->readInt();
    $skipInterval  = $tisFile->readInt();
    if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
        // Value is not used, but it has to be read to move past the header
        $maxSkipLevels = $tisFile->readInt();
    }

    // indexPointer is counted from the start of the file; the header was already consumed
    $tisFile->seek($prevTermInfo[4] /* indexPointer */ - (($tiVersion == (int)0xFFFFFFFD)? 24 : 20) /* header size*/, SEEK_CUR);

    // Start from the index entry. Every term info component is initialised from it,
    // so all of them are defined even if the scan loop below doesn't run.
    $termValue    = $prevTerm[1] /* text */;
    $termFieldNum = $prevTerm[0] /* field */;
    $docFreq      = $prevTermInfo[0] /* docFreq */;
    $freqPointer  = $prevTermInfo[1] /* freqPointer */;
    $proxPointer  = $prevTermInfo[2] /* proxPointer */;
    $skipOffset   = $prevTermInfo[3] /* skipOffset */;

    for ($count = $prevPosition*$indexInterval + 1;
         $count <= $termCount &&
         ( $this->_getFieldPosition($termFieldNum) < $searchDicField ||
          ($this->_getFieldPosition($termFieldNum) == $searchDicField &&
           strcmp($termValue, $term->text) < 0) );
         $count++) {
        // Each entry stores the prefix length shared with the previous term plus a suffix
        $termPrefixLength = $tisFile->readVInt();
        $termSuffix       = $tisFile->readString();
        $termFieldNum     = $tisFile->readVInt();
        $termValue        = Zend_Search_Lucene_Index_Term::getPrefix($termValue, $termPrefixLength) . $termSuffix;

        // Pointers are stored as deltas to the previous entry
        $docFreq      = $tisFile->readVInt();
        $freqPointer += $tisFile->readVInt();
        $proxPointer += $tisFile->readVInt();
        if( $docFreq >= $skipInterval ) {
            $skipOffset = $tisFile->readVInt();
        } else {
            $skipOffset = 0;
        }
    }

    // strcmp() instead of a loose '==': numeric-looking strings such as "1.0" and "1.00"
    // compare equal with '==', although they are different terms.
    if ($termFieldNum == $searchField && strcmp($termValue, $term->text) == 0) {
        $termInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
    } else {
        $termInfo = null;
    }

    // Put loaded termInfo into cache
    $this->_termInfoCache[$termKey] = $termInfo;

    // '>=' rather than '==': the size must still trigger a cleanup if it ever passes 1024
    if (count($this->_termInfoCache) >= 1024) {
        $this->_cleanUpTermInfoCache();
    }

    return $termInfo;
}
/**
 * Returns IDs of all the documents containing term.
 *
 * Each returned ID is the segment-local document number plus $shift.
 *
 * When a documents filter is given, it is also updated for this segment:
 * - if it has no data for the segment yet, it receives all matched documents;
 * - otherwise only documents already present in it are returned, and its data
 *   is replaced by the documents that matched this term as well.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @param integer $shift
 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
 * @throws Zend_Search_Lucene_Exception If $docsFilter is neither null nor a DocsFilter
 * @return array
 */
public function termDocs(Zend_Search_Lucene_Index_Term $term, $shift = 0, $docsFilter = null)
{
    $termInfo = $this->getTermInfo($term);

    if (!($termInfo instanceof Zend_Search_Lucene_Index_TermInfo)) {
        // Unknown term: nothing matches, so a supplied filter becomes empty for this segment
        if ($docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
            $docsFilter->segmentFilters[$this->_name] = array();
        }

        return array();
    }

    $frqFile = $this->openCompoundFile('.frq');
    $frqFile->seek($termInfo->freqPointer, SEEK_CUR);

    $postings = $termInfo->docFreq;
    $docId    = 0;
    $matches  = array();

    // --- No filter: collect every document of the posting list ---
    if ($docsFilter === null) {
        for ($n = 0; $n < $postings; $n++) {
            $delta       = $frqFile->readVInt();
            $freqFollows = ($delta % 2 != 1);
            $docId      += $freqFollows ? $delta/2 : ($delta-1)/2;
            if ($freqFollows) {
                // read freq
                $frqFile->readVInt();
            }

            $matches[] = $shift + $docId;
        }

        return $matches;
    }

    if (!($docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter)) {
        throw new Zend_Search_Lucene_Exception('Documents filter must be an instance of Zend_Search_Lucene_Index_DocsFilter or null.');
    }

    // --- Filter has no data for this segment yet: collect everything and seed the filter ---
    if (!isset($docsFilter->segmentFilters[$this->_name])) {
        $seenDocs = array();

        for ($n = 0; $n < $postings; $n++) {
            $delta       = $frqFile->readVInt();
            $freqFollows = ($delta % 2 != 1);
            $docId      += $freqFollows ? $delta/2 : ($delta-1)/2;
            if ($freqFollows) {
                // read freq
                $frqFile->readVInt();
            }

            $matches[]        = $shift + $docId;
            $seenDocs[$docId] = 1; // the value is irrelevant, only the key matters
        }

        $docsFilter->segmentFilters[$this->_name] = $seenDocs;

        return $matches;
    }

    // --- Filter already has data for this segment: intersect with it ---
    // Short name for the filter (which doesn't need additional dereferencing)
    $filter = &$docsFilter->segmentFilters[$this->_name];

    if (count($filter) == 0) {
        return array();
    }

    // The "fetch" and "full scan" strategies (see FULL_SCAN_VS_FETCH_BOUNDARY) are
    // implemented by the very same loop here, so no selectivity check is needed.
    $keptDocs = array();

    for ($n = 0; $n < $postings; $n++) {
        $delta       = $frqFile->readVInt();
        $freqFollows = ($delta % 2 != 1);
        $docId      += $freqFollows ? $delta/2 : ($delta-1)/2;
        if ($freqFollows) {
            // read freq
            $frqFile->readVInt();
        }

        if (isset($filter[$docId])) {
            $matches[]        = $shift + $docId;
            $keptDocs[$docId] = 1; // the value is irrelevant, only the key matters
        }
    }

    $docsFilter->segmentFilters[$this->_name] = $keptDocs;

    return $matches;
}
/**
 * Returns term freqs array.
 * Result array structure: array(docId => freq, ...)
 *
 * Each key is the segment-local document number plus $shift.
 *
 * When a documents filter is given, it is also updated for this segment:
 * - if it has no data for the segment yet, it receives all matched documents;
 * - otherwise only documents already present in it are returned, and its data
 *   is replaced by the documents that matched this term as well.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @param integer $shift
 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
 * @throws Zend_Search_Lucene_Exception If $docsFilter is neither null nor a DocsFilter
 * @return array
 */
public function termFreqs(Zend_Search_Lucene_Index_Term $term, $shift = 0, $docsFilter = null)
{
    $termInfo = $this->getTermInfo($term);

    if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
        // Unknown term: nothing matches, so a supplied filter becomes empty for this segment
        if ($docsFilter !== null && $docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
            $docsFilter->segmentFilters[$this->_name] = array();
        }
        return array();
    }

    $frqFile = $this->openCompoundFile('.frq');
    $frqFile->seek($termInfo->freqPointer,SEEK_CUR);

    $docId  = 0;
    $result = array();

    if ($docsFilter !== null) {
        if (!$docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
            throw new Zend_Search_Lucene_Exception('Documents filter must be an instance of Zend_Search_Lucene_Index_DocsFilter or null.');
        }

        if (isset($docsFilter->segmentFilters[$this->_name])) {
            // Filter already has some data for the current segment

            // Make short name for the filter (which doesn't need additional dereferencing)
            $filter = &$docsFilter->segmentFilters[$this->_name];

            // Check if filter is not empty
            if (count($filter) == 0) {
                return array();
            }

            // The "fetch" and "full scan" strategies (see FULL_SCAN_VS_FETCH_BOUNDARY)
            // perform the same work here, so a single loop serves both.
            $updatedFilterData = array();

            for ($count = 0; $count < $termInfo->docFreq; $count++) {
                $docDelta = $frqFile->readVInt();
                if ($docDelta % 2 == 1) {
                    // Odd delta: frequency is 1 and is not stored
                    $docId += ($docDelta-1)/2;
                    $freq   = 1;
                } else {
                    // Even delta: the frequency follows in the stream. It has to be
                    // consumed even for documents rejected by the filter, otherwise it
                    // would be taken for the next document delta.
                    $docId += $docDelta/2;
                    $freq   = $frqFile->readVInt();
                }

                if (isset($filter[$docId])) {
                    $result[$shift + $docId]   = $freq;
                    $updatedFilterData[$docId] = 1; // 1 is just some constant value, only the key matters
                }
            }

            $docsFilter->segmentFilters[$this->_name] = $updatedFilterData;
        } else {
            // Filter doesn't have data for the current segment yet
            $filterData = array();

            for ($count = 0; $count < $termInfo->docFreq; $count++) {
                $docDelta = $frqFile->readVInt();
                if ($docDelta % 2 == 1) {
                    $docId += ($docDelta-1)/2;
                    $result[$shift + $docId] = 1;
                } else {
                    $docId += $docDelta/2;
                    $result[$shift + $docId] = $frqFile->readVInt();
                }
                $filterData[$docId] = 1; // 1 is just some constant value, only the key matters
            }

            $docsFilter->segmentFilters[$this->_name] = $filterData;
        }
    } else {
        for ($count = 0; $count < $termInfo->docFreq; $count++) {
            $docDelta = $frqFile->readVInt();
            if ($docDelta % 2 == 1) {
                $docId += ($docDelta-1)/2;
                $result[$shift + $docId] = 1;
            } else {
                $docId += $docDelta/2;
                $result[$shift + $docId] = $frqFile->readVInt();
            }
        }
    }

    return $result;
}
/**
 * Returns term positions array.
 * Result array structure: array(docId => array(pos1, pos2, ...), ...)
 *
 * Every document id used as a result key is the id decoded from the
 * segment's .frq data increased by $shift.
 *
 * When $docsFilter is supplied, its entry for this segment
 * ($docsFilter->segmentFilters[<segment name>]) is both read and rewritten:
 *  - if the entry already exists, only documents listed in it are returned
 *    and the entry is narrowed to the documents that also contain the term;
 *  - if the entry does not exist yet, it is created from all documents
 *    containing the term;
 *  - if the term is unknown to the segment, the entry is set to an empty array.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @param integer $shift
 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
 * @return array
 * @throws Zend_Search_Lucene_Exception  if $docsFilter is neither null nor a DocsFilter
 */
public function termPositions(Zend_Search_Lucene_Index_Term $term, $shift = 0, $docsFilter = null)
{
    $termInfo = $this->getTermInfo($term);

    if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
        if ($docsFilter !== null  &&  $docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
            $docsFilter->segmentFilters[$this->_name] = array();
        }
        return array();
    }

    $frqFile = $this->openCompoundFile('.frq');
    $frqFile->seek($termInfo->freqPointer, SEEK_CUR);

    // Existing filter data for this segment (null means "no restriction yet")
    $filter = null;
    if ($docsFilter !== null) {
        if (!$docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
            throw new Zend_Search_Lucene_Exception('Documents filter must be an instance of Zend_Search_Lucene_Index_DocsFilter or null.');
        }

        if (isset($docsFilter->segmentFilters[$this->_name])) {
            $filter = $docsFilter->segmentFilters[$this->_name];

            // An empty filter can't match anything
            if (count($filter) == 0) {
                return array();
            }
        }
    }

    // The previous implementation chose between a "fetch" and a "full scan"
    // strategy here, but both branches executed exactly the same statements,
    // so a single pass is used now.

    // Read document ids and in-document frequencies of the term
    $freqs = array();
    $docId = 0;
    for ($count = 0; $count < $termInfo->docFreq; $count++) {
        $docDelta = $frqFile->readVInt();
        if ($docDelta % 2 == 1) {
            // Odd delta: frequency is 1 and isn't stored separately
            $docId += ($docDelta-1)/2;
            $freqs[$docId] = 1;
        } else {
            // Even delta: frequency follows as a separate VInt
            $docId += $docDelta/2;
            $freqs[$docId] = $frqFile->readVInt();
        }
    }

    $prxFile = $this->openCompoundFile('.prx');
    $prxFile->seek($termInfo->proxPointer, SEEK_CUR);

    $result      = array();
    $matchedDocs = array();
    foreach ($freqs as $docId => $freq) {
        $termPosition = 0;
        $positions    = array();

        // Positions have to be read even if the document is rejected by the
        // filter, otherwise the .prx stream is misaligned for the next document
        for ($count = 0; $count < $freq; $count++) {
            $termPosition += $prxFile->readVInt();
            $positions[]   = $termPosition;
        }

        if ($filter !== null  &&  !isset($filter[$docId])) {
            continue;
        }

        $matchedDocs[$docId]     = 1; // value is irrelevant, only keys are used
        $result[$shift + $docId] = $positions;
    }

    if ($docsFilter !== null) {
        $docsFilter->segmentFilters[$this->_name] = $matchedDocs;
    }

    return $result;
}
/**
 * Load normalization factors from an index file
 *
 * If the segment uses a single norms file ('.nrm'), norms of every indexed
 * field are loaded at once; otherwise only the '.f<fieldNum>' file of the
 * requested field is read. One byte per document is read for each field.
 *
 * @param integer $fieldNum
 * @throws Zend_Search_Lucene_Exception  if the '.nrm' header is not 'NRM' followed by 0xFF
 */
private function _loadNorm($fieldNum)
{
    if (!$this->_hasSingleNormFile) {
        // Separate norms file per field
        $fieldNormsFile = $this->openCompoundFile('.f' . $fieldNum);
        $this->_norms[$fieldNum] = $fieldNormsFile->readBytes($this->_docCount);
        return;
    }

    $normsFile     = $this->openCompoundFile('.nrm');
    $signature     = $normsFile->readBytes(3);
    $formatVersion = $normsFile->readByte();

    if (!($signature == 'NRM'  &&  $formatVersion == (int)0xFF)) {
        throw new Zend_Search_Lucene_Exception('Wrong norms file format.');
    }

    // Norm vectors are read sequentially, in field order, for indexed fields only
    foreach ($this->_fields as $number => $info) {
        if (!$info->isIndexed) {
            continue;
        }
        $this->_norms[$number] = $normsFile->readBytes($this->_docCount);
    }
}
/**
 * Returns normalization factor for specified documents
 *
 * Null is returned if the field is not present in this segment or is not
 * indexed. Norms of the field are loaded lazily on first access.
 *
 * @param integer $id         document id within this segment
 * @param string  $fieldName
 * @return float|null
 */
public function norm($id, $fieldName)
{
    $fieldNum = $this->getFieldNum($fieldName);

    // getFieldNum() reports an unknown field as -1 (see normVector());
    // there is no field info to inspect in that case
    if ($fieldNum == -1  ||  !($this->_fields[$fieldNum]->isIndexed)) {
        return null;
    }

    if (!isset($this->_norms[$fieldNum])) {
        $this->_loadNorm($fieldNum);
    }

    // One encoded byte per document
    return Zend_Search_Lucene_Search_Similarity::decodeNorm( ord($this->_norms[$fieldNum][$id]) );
}
/**
 * Returns norm vector, encoded in a byte string
 *
 * For a field that is unknown to the segment or is not indexed, a vector
 * filled with the encoded default norm (length norm for 0 terms) is built,
 * one byte per document.
 *
 * @param string $fieldName
 * @return string
 */
public function normVector($fieldName)
{
    $fieldNum = $this->getFieldNum($fieldName);

    $hasStoredNorms = ($fieldNum != -1)  &&  $this->_fields[$fieldNum]->isIndexed;
    if (!$hasStoredNorms) {
        $similarity  = Zend_Search_Lucene_Search_Similarity::getDefault();
        $defaultNorm = chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) ));

        return str_repeat($defaultNorm, $this->_docCount);
    }

    if (!isset($this->_norms[$fieldNum])) {
        // Lazy loading
        $this->_loadNorm($fieldNum);
    }

    return $this->_norms[$fieldNum];
}
/**
 * Tells whether a deletions structure exists for this index segment
 * (i.e. the internal deletions storage is not null).
 *
 * @return boolean
 */
public function hasDeletions()
{
    if ($this->_deleted === null) {
        return false;
    }

    return true;
}
/**
 * Tells whether norms of all fields are stored in one norms file.
 *
 * The internal flag is normalized to a strict boolean.
 *
 * @return boolean
 */
public function hasSingleNormFile()
{
    return (bool) $this->_hasSingleNormFile;
}
/**
* Returns the segment's "compound" flag, i.e. whether the segment is stored
* using a compound segment file.
*
* The stored flag is returned as is (no boolean cast is applied).
*
* @return boolean
*/
public function isCompound()
{
return $this->_isCompound;
}
/**
 * Marks a document of this index segment as deleted.
 *
 * The change is only recorded in memory and flagged as dirty; it is
 * persisted by writeChanges(). Deletions are kept in a bitset if the
 * 'bitset' extension is loaded and in an array keyed by document id otherwise.
 *
 * @param integer $id  internal document id
 */
public function delete($id)
{
    $this->_deletedDirty = true;

    if (!extension_loaded('bitset')) {
        // Plain PHP storage: document id => 1
        if ($this->_deleted === null) {
            $this->_deleted = array();
        }
        $this->_deleted[$id] = 1;

        return;
    }

    if ($this->_deleted === null) {
        $this->_deleted = bitset_empty($id);
    }
    bitset_incl($this->_deleted, $id);
}
/**
 * Checks whether a document is marked as deleted
 *
 * @param integer $id  internal document id
 * @return boolean
 */
public function isDeleted($id)
{
    // No deletions structure at all => nothing is deleted
    if ($this->_deleted === null) {
        return false;
    }

    return extension_loaded('bitset')
         ? bitset_in($this->_deleted, $id)
         : isset($this->_deleted[$id]);
}
/**
 * Detect latest delete generation
 *
 * Scans the directory file list and stores the result in $this->_delGen:
 *  - -1 if there is no deletions file for this segment,
 *  -  0 if only '<segment_name>.del' exists,
 *  -  otherwise the highest generation found among '<segment_name>_NNN.del'
 *     files (NNN is a base-36 number).
 *
 * Is actually used from writeChanges() method or from the constructor if it's invoked from
 * Index writer. In both cases index write lock is already obtained, so we shouldn't care
 * about it
 */
private function _detectLatestDelGen()
{
    // Segment name is matched literally, so it must not be interpreted as a pattern
    $delFilePattern = '/^' . preg_quote($this->_name, '/') . '_([a-zA-Z0-9]+)\.del$/i';

    $delFileList = array();
    foreach ($this->_directory->fileList() as $file) {
        if ($file == $this->_name . '.del') {
            // Matches <segment_name>.del file name
            $delFileList[] = 0;
        } else if (preg_match($delFilePattern, $file, $matches)) {
            // Matches <segment_name>_NNN.del file names
            $delFileList[] = (int)base_convert($matches[1], 36, 10);
        }
    }

    if (count($delFileList) == 0) {
        // There is no deletions file for current segment in the directory
        // Set deletions file generation number to -1 ("no deletions file")
        $this->_delGen = -1;
    } else {
        // There are some deletions files for current segment in the directory
        // Set deletions file generation number to the highest number
        $this->_delGen = max($delFileList);
    }
}
/**
 * Write changes if it's necessary.
 *
 * Nothing is done unless deletions were changed since the last write.
 * Otherwise a new '<segment_name>_<generation>.del' file (generation is
 * written in base 36) is created containing: document count, number of
 * deleted documents and the deletions bit vector.
 *
 * This method must be invoked only from the Writer _updateSegments() method,
 * so index Write lock has to be already obtained.
 *
 * @internal
 */
public function writeChanges()
{
    if (!$this->_deletedDirty) {
        return;
    }

    if (extension_loaded('bitset')) {
        // Bitset is already stored as a byte string
        $delBytes = $this->_deleted;
        $bitCount = count(bitset_to_array($delBytes));
    } else {
        // Pack array of deleted ids into a bit vector, 8 documents per byte
        $byteCount = floor($this->_docCount/8)+1;
        $delBytes  = str_repeat(chr(0), $byteCount);

        for ($byteIndex = 0; $byteIndex < $byteCount; $byteIndex++) {
            $mask = 0;
            for ($bit = 0; $bit < 8; $bit++) {
                if (isset($this->_deleted[$byteIndex*8 + $bit])) {
                    $mask |= (1<<$bit);
                }
            }
            $delBytes[$byteIndex] = chr($mask);
        }

        $bitCount = count($this->_deleted);
    }

    // Next generation: 1 if there is no deletions file yet, latest + 1 otherwise
    $this->_detectLatestDelGen();
    $this->_delGen = ($this->_delGen == -1) ? 1 : $this->_delGen + 1;

    $delFileName = $this->_name . '_' . base_convert($this->_delGen, 10, 36) . '.del';
    $delFile     = $this->_directory->createFile($delFileName);

    $delFile->writeInt($this->_docCount);
    $delFile->writeInt($bitCount);
    $delFile->writeBytes($delBytes);

    $this->_deletedDirty = false;
}
/**
* Term Dictionary File object for stream-like terms reading.
* Null while the terms stream is closed.
*
* @var Zend_Search_Lucene_Storage_File|null
*/
private $_tisFile = null;
/**
* Offset of the .tis file data, as reported by the file right after it's
* opened by reset()
*
* @var integer
*/
private $_tisFileOffset;
/**
* Frequencies File object for stream-like terms reading.
* Null while the terms stream is closed or frequencies aren't requested.
*
* @var Zend_Search_Lucene_Storage_File|null
*/
private $_frqFile = null;
/**
* Offset of the .frq file data, as reported by the file right after it's
* opened by reset()
*
* @var integer
*/
private $_frqFileOffset;
/**
* Positions File object for stream-like terms reading.
* Null while the terms stream is closed or positions aren't requested.
*
* @var Zend_Search_Lucene_Storage_File|null
*/
private $_prxFile = null;
/**
* Offset of the .prx file data, as reported by the file right after it's
* opened by reset()
*
* @var integer
*/
private $_prxFileOffset;
/**
* Number of terms remaining in the term stream
* (decreased each time a term is read by nextTerm())
*
* @var integer
*/
private $_termCount = 0;
/**
* Overall number of terms in term stream
*
* @var integer
*/
private $_termNum = 0;
/**
* Segment index interval
*
* @var integer
*/
private $_indexInterval;
/**
* Segment skip interval
*
* @var integer
*/
private $_skipInterval;
/**
* Last TermInfo in a terms stream (null if the stream is closed or exhausted)
*
* @var Zend_Search_Lucene_Index_TermInfo|null
*/
private $_lastTermInfo = null;
/**
* Last Term in a terms stream (null if the stream is closed or exhausted)
*
* @var Zend_Search_Lucene_Index_Term|null
*/
private $_lastTerm = null;
/**
* Map of the document IDs
* Used to get new docID after removing deleted documents.
* It's not very efficient from a memory usage point of view,
* but it's much faster than other methods
*
* @var array|null
*/
private $_docMap = null;
/**
* An array of all term positions in the documents.
* Array structure: array( docId => array( pos1, pos2, ...), ...)
*
* Is set to null if term positions loading has to be skipped
*
* @var array|null
*/
private $_lastTermPositions;
/**
* Terms scan mode
*
* Values:
*
* self::SM_TERMS_ONLY - terms are scanned, no additional info is retrieved
* self::SM_FULL_INFO - terms are scanned, frequency and position info is retrieved
* self::SM_MERGE_INFO - terms are scanned, frequency and position info is retrieved
* document numbers are compacted (shifted if segment has deleted documents)
*
* @var integer
*/
private $_termsScanMode;
/** Scan modes */
const SM_TERMS_ONLY = 0; // terms are scanned, no additional info is retrieved
const SM_FULL_INFO = 1; // terms are scanned, frequency and position info is retrieved
const SM_MERGE_INFO = 2; // terms are scanned, frequency and position info is retrieved
// document numbers are compacted (shifted if segment contains deleted documents)
/**
 * Reset terms stream
 *
 * (Re)opens the term dictionary, positions the stream on the first term and,
 * depending on $mode, prepares frequency/position files and the document
 * id map:
 *  - self::SM_TERMS_ONLY - only terms are scanned;
 *  - self::SM_FULL_INFO  - document ids are mapped to $startId + id;
 *  - self::SM_MERGE_INFO - deleted documents are removed and remaining ids
 *                          are renumbered consecutively from $startId.
 *
 * Returns start document id for the next segment
 *
 * @param integer $startId  id for the first document of this segment
 * @param integer $mode     one of the self::SM_* constants
 * @throws Zend_Search_Lucene_Exception  on wrong .tis format or unknown $mode
 * @return integer
 */
public function reset($startId = 0, $mode = self::SM_TERMS_ONLY)
{
    // Release previously opened dictionary stream (if any) before reopening
    $this->_tisFile = null;

    $this->_tisFile       = $this->openCompoundFile('.tis', false);
    $this->_tisFileOffset = $this->_tisFile->tell();

    $tiVersion = $this->_tisFile->readInt();
    if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */ &&
        $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
        throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
    }

    $this->_termCount     =
          $this->_termNum = $this->_tisFile->readLong(); // Read terms count
    $this->_indexInterval = $this->_tisFile->readInt();  // Read Index interval
    $this->_skipInterval  = $this->_tisFile->readInt();  // Read skip interval
    if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
        // Max skip levels value isn't used, but it has to be consumed
        // to position the stream on the first term entry
        $this->_tisFile->readInt();
    }

    $this->_frqFile = null;
    $this->_prxFile = null;

    $this->_docMap = array();

    $this->_lastTerm          = new Zend_Search_Lucene_Index_Term('', -1);
    $this->_lastTermInfo      = new Zend_Search_Lucene_Index_TermInfo(0, 0, 0, 0);
    $this->_lastTermPositions = null;

    $this->_termsScanMode = $mode;

    switch ($mode) {
        case self::SM_TERMS_ONLY:
            // Do nothing
            break;

        case self::SM_FULL_INFO:
            // break intentionally omitted
        case self::SM_MERGE_INFO:
            $this->_frqFile       = $this->openCompoundFile('.frq', false);
            $this->_frqFileOffset = $this->_frqFile->tell();

            $this->_prxFile       = $this->openCompoundFile('.prx', false);
            $this->_prxFileOffset = $this->_prxFile->tell();

            for ($count = 0; $count < $this->_docCount; $count++) {
                if (!$this->isDeleted($count)) {
                    $this->_docMap[$count] = $startId + (($mode == self::SM_MERGE_INFO) ? count($this->_docMap) : $count);
                }
            }
            break;

        default:
            throw new Zend_Search_Lucene_Exception('Wrong terms scaning mode specified.');
    }

    // Next segment start id has to be calculated before nextTerm() is called:
    // nextTerm() sets $this->_docMap to null if the segment has no terms
    $nextSegmentStartId = $startId + (($mode == self::SM_MERGE_INFO) ? count($this->_docMap) : $this->_docCount);

    $this->nextTerm();

    return $nextSegmentStartId;
}
/**
 * Skip terms stream up to specified term prefix.
 *
 * Prefix contains fully specified field info and portion of searched term
 *
 * The in-memory dictionary index is binary-searched for the closest entry
 * that is not greater than the prefix, the .tis stream is positioned there
 * and then scanned forward until the current term is equal to or greater
 * than the prefix. If the field is unknown to the segment or the prefix
 * precedes the whole dictionary index, the stream is closed (current term
 * becomes null).
 *
 * @param Zend_Search_Lucene_Index_Term $prefix
 * @throws Zend_Search_Lucene_Exception
 */
public function skipTo(Zend_Search_Lucene_Index_Term $prefix)
{
    if ($this->_termDictionary === null) {
        $this->_loadDictionaryIndex();
    }

    $searchField = $this->getFieldNum($prefix->field);

    if ($searchField == -1) {
        /**
         * Field is not presented in this segment
         * Go to the end of dictionary
         */
        $this->_tisFile = null;
        $this->_frqFile = null;
        $this->_prxFile = null;

        $this->_lastTerm          = null;
        $this->_lastTermInfo      = null;
        $this->_lastTermPositions = null;

        return;
    }
    $searchDicField = $this->_getFieldPosition($searchField);

    // search for appropriate value in dictionary
    $lowIndex  = 0;
    $highIndex = count($this->_termDictionary)-1;
    while ($highIndex >= $lowIndex) {
        $mid     = ($highIndex + $lowIndex) >> 1;
        $midTerm = $this->_termDictionary[$mid];

        $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */);
        $delta    = $searchDicField - $fieldNum;
        if ($delta == 0) {
            $delta = strcmp($prefix->text, $midTerm[1] /* text */);
        }

        if ($delta < 0) {
            $highIndex = $mid-1;
        } elseif ($delta > 0) {
            $lowIndex  = $mid+1;
        } else {
            // We have reached term we are looking for.
            // $highIndex is used as the start position below, so it has to
            // point to the matched entry; otherwise the stream would start
            // from a later index entry and skip the requested term
            $highIndex = $mid;
            break;
        }
    }

    if ($highIndex == -1) {
        // Term is out of the dictionary range
        $this->_tisFile = null;
        $this->_frqFile = null;
        $this->_prxFile = null;

        $this->_lastTerm          = null;
        $this->_lastTermInfo      = null;
        $this->_lastTermPositions = null;

        return;
    }

    $prevPosition = $highIndex;
    $prevTerm     = $this->_termDictionary[$prevPosition];
    $prevTermInfo = $this->_termDictionaryInfos[$prevPosition];

    if ($this->_tisFile === null) {
        // The end of terms stream is reached and terms dictionary file is closed
        // Perform mini-reset operation
        $this->_tisFile = $this->openCompoundFile('.tis', false);

        if ($this->_termsScanMode == self::SM_FULL_INFO  ||  $this->_termsScanMode == self::SM_MERGE_INFO) {
            $this->_frqFile = $this->openCompoundFile('.frq', false);
            $this->_prxFile = $this->openCompoundFile('.prx', false);
        }
    }
    $this->_tisFile->seek($this->_tisFileOffset + $prevTermInfo[4], SEEK_SET);

    $this->_lastTerm     = new Zend_Search_Lucene_Index_Term($prevTerm[1] /* text */,
                                                             ($prevTerm[0] == -1) ? '' : $this->_fields[$prevTerm[0] /* field */]->name);
    $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo($prevTermInfo[0] /* docFreq */,
                                                                 $prevTermInfo[1] /* freqPointer */,
                                                                 $prevTermInfo[2] /* proxPointer */,
                                                                 $prevTermInfo[3] /* skipOffset */);
    $this->_termCount    = $this->_termNum - $prevPosition*$this->_indexInterval;

    if ($highIndex == 0) {
        // skip start entry
        $this->nextTerm();
    } else if ($prefix->field == $this->_lastTerm->field  &&  $prefix->text == $this->_lastTerm->text) {
        // We got exact match in the dictionary index

        if ($this->_termsScanMode == self::SM_FULL_INFO  ||  $this->_termsScanMode == self::SM_MERGE_INFO) {
            $this->_lastTermPositions = array();

            // Read document ids and frequencies of the term
            $this->_frqFile->seek($this->_lastTermInfo->freqPointer + $this->_frqFileOffset, SEEK_SET);
            $freqs = array();   $docId = 0;
            for ($count = 0; $count < $this->_lastTermInfo->docFreq; $count++) {
                $docDelta = $this->_frqFile->readVInt();
                if ($docDelta % 2 == 1) {
                    $docId += ($docDelta-1)/2;
                    $freqs[ $docId ] = 1;
                } else {
                    $docId += $docDelta/2;
                    $freqs[ $docId ] = $this->_frqFile->readVInt();
                }
            }

            // Read positions; documents missing in the doc map are dropped
            $this->_prxFile->seek($this->_lastTermInfo->proxPointer + $this->_prxFileOffset, SEEK_SET);
            foreach ($freqs as $docId => $freq) {
                $termPosition = 0;  $positions = array();

                for ($count = 0; $count < $freq; $count++) {
                    $termPosition += $this->_prxFile->readVInt();
                    $positions[]   = $termPosition;
                }

                if (isset($this->_docMap[$docId])) {
                    $this->_lastTermPositions[$this->_docMap[$docId]] = $positions;
                }
            }
        }

        return;
    }

    // Search term matching specified prefix
    while ($this->_lastTerm !== null) {
        if ( strcmp($this->_lastTerm->field, $prefix->field) > 0  ||
             ($prefix->field == $this->_lastTerm->field  &&  strcmp($this->_lastTerm->text, $prefix->text) >= 0) ) {
            // Current term matches or is greater than the pattern
            return;
        }

        $this->nextTerm();
    }
}
/**
 * Scans terms dictionary and returns next term
 *
 * Null is returned (and the whole stream state, including the document
 * map, is cleared) if the stream is closed or has no more terms.
 * In SM_FULL_INFO and SM_MERGE_INFO modes positions of the term are loaded
 * as well; document ids are translated through the document map and
 * documents absent from the map are skipped.
 *
 * @return Zend_Search_Lucene_Index_Term|null
 */
public function nextTerm()
{
    $streamExhausted = ($this->_tisFile === null)  ||  ($this->_termCount == 0);
    if ($streamExhausted) {
        $this->_lastTerm          = null;
        $this->_lastTermInfo      = null;
        $this->_lastTermPositions = null;
        $this->_docMap            = null;

        // may be necessary for "empty" segment
        $this->_tisFile = null;
        $this->_frqFile = null;
        $this->_prxFile = null;

        return null;
    }

    // Term text is stored as "length of prefix shared with previous term" + suffix
    $sharedPrefixLength = $this->_tisFile->readVInt();
    $suffix             = $this->_tisFile->readString();
    $fieldNumber        = $this->_tisFile->readVInt();
    $text               = Zend_Search_Lucene_Index_Term::getPrefix($this->_lastTerm->text, $sharedPrefixLength) . $suffix;

    $this->_lastTerm = new Zend_Search_Lucene_Index_Term($text, $this->_fields[$fieldNumber]->name);

    // File pointers are stored as deltas to the previous term's pointers
    $docFreq     = $this->_tisFile->readVInt();
    $freqPointer = $this->_lastTermInfo->freqPointer + $this->_tisFile->readVInt();
    $proxPointer = $this->_lastTermInfo->proxPointer + $this->_tisFile->readVInt();
    $skipOffset  = ($docFreq >= $this->_skipInterval) ? $this->_tisFile->readVInt() : 0;

    $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);

    $positionsRequested = ($this->_termsScanMode == self::SM_FULL_INFO)
                       || ($this->_termsScanMode == self::SM_MERGE_INFO);
    if ($positionsRequested) {
        $this->_lastTermPositions = array();

        // Document ids and in-document frequencies
        $this->_frqFile->seek($this->_lastTermInfo->freqPointer + $this->_frqFileOffset, SEEK_SET);
        $frequencies = array();
        $docId       = 0;
        for ($read = 0; $read < $this->_lastTermInfo->docFreq; $read++) {
            $docDelta = $this->_frqFile->readVInt();
            if ($docDelta % 2 == 1) {
                // Odd delta: frequency is 1 and isn't stored
                $docId += ($docDelta-1)/2;
                $frequencies[$docId] = 1;
            } else {
                $docId += $docDelta/2;
                $frequencies[$docId] = $this->_frqFile->readVInt();
            }
        }

        // Positions (stored as deltas within a document)
        $this->_prxFile->seek($this->_lastTermInfo->proxPointer + $this->_prxFileOffset, SEEK_SET);
        foreach ($frequencies as $docId => $frequency) {
            $position  = 0;
            $positions = array();

            for ($read = 0; $read < $frequency; $read++) {
                $position   += $this->_prxFile->readVInt();
                $positions[] = $position;
            }

            if (isset($this->_docMap[$docId])) {
                $this->_lastTermPositions[$this->_docMap[$docId]] = $positions;
            }
        }
    }

    $this->_termCount--;
    if ($this->_termCount == 0) {
        // Last term is read; files aren't needed any more
        $this->_tisFile = null;
        $this->_frqFile = null;
        $this->_prxFile = null;
    }

    return $this->_lastTerm;
}
/**
 * Close terms stream
 *
 * Drops file objects, current term data and the document map.
 * Should be used for resources clean up if stream is not read up to the end
 */
public function closeTermsStream()
{
    $streamState = array('_tisFile',
                         '_frqFile',
                         '_prxFile',
                         '_lastTerm',
                         '_lastTermInfo',
                         '_lastTermPositions',
                         '_docMap');

    foreach ($streamState as $property) {
        $this->$property = null;
    }
}
/**
* Returns term in current position
*
* Null is returned if the terms stream is closed or read up to the end.
*
* @return Zend_Search_Lucene_Index_Term|null
*/
public function currentTerm()
{
return $this->_lastTerm;
}
/**
* Returns an array of all positions of the current term in the documents.
* Return array structure: array( docId => array( pos1, pos2, ...), ...)
*
* Null is returned if positions are not loaded (e.g. the stream is closed
* or read up to the end).
*
* @return array|null
*/
public function currentTermPositions()
{
return $this->_lastTermPositions;
}
}
PK =hH[(t�%F F SegmentInfoPriorityQueue.phpnu &1i� <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Exception */
require_once 'Zend/Search/Lucene/Exception.php';
/** Zend_Search_Lucene_PriorityQueue */
require_once 'Zend/Search/Lucene/PriorityQueue.php';
/**
 * Priority queue of segment info objects ordered by the key of the term
 * each segment's terms stream is currently positioned on.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Index_SegmentInfoPriorityQueue extends Zend_Search_Lucene_PriorityQueue
{
    /**
     * Compare elements
     *
     * Returns true if the current term key of $segmentInfo1 sorts (bytewise)
     * before the current term key of $segmentInfo2, false otherwise.
     * Both elements must have a current term.
     *
     * @param mixed $segmentInfo1
     * @param mixed $segmentInfo2
     * @return boolean
     */
    protected function _less($segmentInfo1, $segmentInfo2)
    {
        $firstKey  = $segmentInfo1->currentTerm()->key();
        $secondKey = $segmentInfo2->currentTerm()->key();

        return strcmp($firstKey, $secondKey) < 0;
    }
}
PK =hH[�>;�� � SegmentWriter/StreamWriter.phpnu &1i� <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Exception */
require_once 'Zend/Search/Lucene/Exception.php';
/** Zend_Search_Lucene_Index_SegmentInfo */
require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
/** Zend_Search_Lucene_Index_SegmentWriter */
require_once 'Zend/Search/Lucene/Index/SegmentWriter.php';
/**
 * Segment writer fed with already prepared data: stored fields files are
 * created on request and norm vectors are appended chunk by chunk.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Index_SegmentWriter_StreamWriter extends Zend_Search_Lucene_Index_SegmentWriter
{
    /**
     * Object constructor.
     *
     * @param Zend_Search_Lucene_Storage_Directory $directory
     * @param string $name
     */
    public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
    {
        parent::__construct($directory, $name);
    }

    /**
     * Create stored fields files ('.fdx' and '.fdt') and open them for write
     *
     * Both file names are registered in the segment's file list.
     */
    public function createStoredFieldsFiles()
    {
        $indexFileName = $this->_name . '.fdx';
        $dataFileName  = $this->_name . '.fdt';

        $this->_fdxFile = $this->_directory->createFile($indexFileName);
        $this->_fdtFile = $this->_directory->createFile($dataFileName);

        $this->_files[] = $indexFileName;
        $this->_files[] = $dataFileName;
    }

    /**
     * Append a norm vector to the norms collected for a field
     *
     * @param string $fieldName
     * @param string $normVector  encoded norms (byte string)
     */
    public function addNorm($fieldName, $normVector)
    {
        if (!isset($this->_norms[$fieldName])) {
            // First chunk for this field
            $this->_norms[$fieldName] = $normVector;
            return;
        }

        $this->_norms[$fieldName] .= $normVector;
    }

    /**
     * Close segment, write it to disk and return segment info
     *
     * Nothing is written and null is returned if the segment has no documents.
     *
     * @return Zend_Search_Lucene_Index_SegmentInfo|null
     */
    public function close()
    {
        $isEmpty = ($this->_docCount == 0);
        if ($isEmpty) {
            return null;
        }

        $this->_dumpFNM();
        $this->_generateCFS();

        $segmentInfo = new Zend_Search_Lucene_Index_SegmentInfo($this->_directory,
                                                                $this->_name,
                                                                $this->_docCount,
                                                                -1,
                                                                null,
                                                                true,
                                                                true);

        return $segmentInfo;
    }
}
PK =hH[#$ �� � SegmentWriter/DocumentWriter.phpnu &1i� <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Exception */
require_once 'Zend/Search/Lucene/Exception.php';
/** Zend_Search_Lucene_Analysis_Analyzer */
require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
/** Zend_Search_Lucene_Index_SegmentWriter */
require_once 'Zend/Search/Lucene/Index/SegmentWriter.php';
/**
 * Segment writer which builds a segment from documents added one by one.
 * Terms and their positions are collected in memory and dumped on close().
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter extends Zend_Search_Lucene_Index_SegmentWriter
{
    /**
     * Term Dictionary
     * Array of the Zend_Search_Lucene_Index_Term objects, indexed by term key
     *
     * @var array
     */
    protected $_termDictionary;

    /**
     * Documents, which contain the term
     * Array structure: array(termKey => array(docNumber => array(pos1, pos2, ...), ...), ...)
     *
     * @var array
     */
    protected $_termDocs;

    /**
     * Object constructor.
     *
     * @param Zend_Search_Lucene_Storage_Directory $directory
     * @param string $name
     */
    public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
    {
        parent::__construct($directory, $name);

        $this->_termDocs       = array();
        $this->_termDictionary = array();
    }

    /**
     * Register one occurrence of a term in the document being added
     * (its number is the current document count).
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @param integer $position
     */
    private function _recordTermPosition(Zend_Search_Lucene_Index_Term $term, $position)
    {
        $termKey   = $term->key();
        $docNumber = $this->_docCount;

        if (!isset($this->_termDictionary[$termKey])) {
            // New term
            $this->_termDictionary[$termKey] = $term;
            $this->_termDocs[$termKey]       = array($docNumber => array());
        } elseif (!isset($this->_termDocs[$termKey][$docNumber])) {
            // Existing term, but new term entry
            $this->_termDocs[$termKey][$docNumber] = array();
        }

        $this->_termDocs[$termKey][$docNumber][] = $position;
    }

    /**
     * Adds a document to this segment.
     *
     * Indexed fields contribute terms (tokenized through the default analyzer,
     * or the whole UTF-8 value as a single term at position 0) and a norm byte;
     * stored fields are passed to addStoredFields().
     *
     * @param Zend_Search_Lucene_Document $document
     * @throws Zend_Search_Lucene_Exception  if a field requests term vector storing
     */
    public function addDocument(Zend_Search_Lucene_Document $document)
    {
        $storedFields = array();
        $docNorms     = array();
        $similarity   = Zend_Search_Lucene_Search_Similarity::getDefault();

        foreach ($document->getFieldNames() as $fieldName) {
            $field = $document->getField($fieldName);
            $this->addField($field);

            if ($field->storeTermVector) {
                /**
                 * @todo term vector storing support
                 */
                throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
            }

            if ($field->isIndexed) {
                if ($field->isTokenized) {
                    $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
                    $analyzer->setInput($field->value, $field->encoding);

                    $position = 0;
                    $numTerms = 0;
                    for ($token = $analyzer->nextToken(); $token !== null; $token = $analyzer->nextToken()) {
                        $numTerms++;

                        $term      = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
                        $position += $token->getPositionIncrement();

                        $this->_recordTermPosition($term, $position);
                    }
                } else {
                    // Whole field value is a single term
                    $term = new Zend_Search_Lucene_Index_Term($field->getUtf8Value(), $field->name);
                    $this->_recordTermPosition($term, 0);

                    $numTerms = 1;
                }

                $docNorms[$field->name] = chr($similarity->encodeNorm( $similarity->lengthNorm($field->name, $numTerms)*
                                                                       $document->boost*
                                                                       $field->boost ));
            }

            if ($field->isStored) {
                $storedFields[] = $field;
            }
        }

        // Every indexed field of the segment gets exactly one norm byte per document
        foreach ($this->_fields as $fieldName => $field) {
            if (!$field->isIndexed) {
                continue;
            }

            if (!isset($this->_norms[$fieldName])) {
                // Field is new for the segment: pad norms of previously added documents
                $this->_norms[$fieldName] = str_repeat(chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) )),
                                                       $this->_docCount);
            }

            $this->_norms[$fieldName] .= isset($docNorms[$fieldName])
                                       ? $docNorms[$fieldName]
                                       : chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) ));
        }

        $this->addStoredFields($storedFields);
    }

    /**
     * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files
     *
     * Terms are written in the string order of their keys.
     */
    protected function _dumpDictionary()
    {
        ksort($this->_termDictionary, SORT_STRING);

        $this->initializeDictionaryFiles();

        foreach ($this->_termDictionary as $termKey => $term) {
            $this->addTerm($term, $this->_termDocs[$termKey]);
        }

        $this->closeDictionaryFiles();
    }

    /**
     * Close segment, write it to disk and return segment info
     *
     * Nothing is written and null is returned if the segment has no documents.
     *
     * @return Zend_Search_Lucene_Index_SegmentInfo|null
     */
    public function close()
    {
        $isEmpty = ($this->_docCount == 0);
        if ($isEmpty) {
            return null;
        }

        $this->_dumpFNM();
        $this->_dumpDictionary();
        $this->_generateCFS();

        $segmentInfo = new Zend_Search_Lucene_Index_SegmentInfo($this->_directory,
                                                                $this->_name,
                                                                $this->_docCount,
                                                                -1,
                                                                null,
                                                                true,
                                                                true);

        return $segmentInfo;
    }
}
PK =hH[���cX� X�
Writer.phpnu &1i� <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter */
require_once 'Zend/Search/Lucene/Index/SegmentWriter/DocumentWriter.php';
/** Zend_Search_Lucene_Index_SegmentInfo */
require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
/** Zend_Search_Lucene_Index_SegmentMerger */
require_once 'Zend/Search/Lucene/Index/SegmentMerger.php';
/** Zend_Search_Lucene_LockManager */
require_once 'Zend/Search/Lucene/LockManager.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Index_Writer
{
/**
* Open tasks for this class:
*
* @todo Implement Analyzer substitution
* @todo Implement Zend_Search_Lucene_Storage_DirectoryRAM and Zend_Search_Lucene_Storage_FileRAM to use it for
* temporary index files
* @todo Directory lock processing
*/
/**
* Number of documents required before the buffered in-memory
* documents are written into a new Segment
*
* addDocument() commits the current segment as soon as its document count
* reaches this value. The same value is the initial pool threshold used by
* the auto-merge planning in _hasAnythingToMerge() and _maybeMergeSegments().
*
* Default value is 10
*
* @var integer
*/
public $maxBufferedDocs = 10;
/**
* Largest number of documents ever merged by addDocument().
* Small values (e.g., less than 10,000) are best for interactive indexing,
* as this limits the length of pauses while indexing to a few seconds.
* Larger values are best for batched indexing and speedier searches.
*
* Auto-merge planning stops as soon as the pool threshold grows above this value.
*
* Default value is PHP_INT_MAX
*
* @var integer
*/
public $maxMergeDocs = PHP_INT_MAX;
/**
* Determines how often segment indices are merged by addDocument().
*
* With smaller values, less RAM is used while indexing,
* and searches on unoptimized indices are faster,
* but indexing speed is slower.
*
* With larger values, more RAM is used during indexing,
* and while searches on unoptimized indices are slower,
* indexing is faster.
*
* Thus larger values (> 10) are best for batch index creation,
* and smaller values (< 10) for indices that are interactively maintained.
*
* The auto-merge pool threshold is multiplied by this factor each time a
* segment is too large for the current merge level.
*
* Default value is 10
*
* @var integer
*/
public $mergeFactor = 10;
/**
* File system adapter.
*
* @var Zend_Search_Lucene_Storage_Directory
*/
private $_directory = null;
/**
* Changes counter.
*
* Incremented by addDocument(); _updateSegments() adds it to the index
* version stored in the segments file and then resets it to 0.
*
* @var integer
*/
private $_versionUpdate = 0;
/**
* List of the segments, created by index writer
* Array of Zend_Search_Lucene_Index_SegmentInfo objects, keyed by segment name
*
* @var array
*/
private $_newSegments = array();
/**
* List of segments to be deleted on commit
*
* Segment names, stored both as key and as value (see _mergeSegments()).
*
* @var array
*/
private $_segmentsToDelete = array();
/**
* Current segment to add documents
*
* Null until the first addDocument() call and again after each commit().
*
* @var Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter
*/
private $_currentSegment = null;
/**
* Array of Zend_Search_Lucene_Index_SegmentInfo objects for this index.
*
* It's a reference to the corresponding Zend_Search_Lucene::$_segmentInfos array
*
* @var array Zend_Search_Lucene_Index_SegmentInfo
*/
private $_segmentInfos;
/**
* Index target format version
*
* Compared against the Zend_Search_Lucene::FORMAT_* constants when a new
* segments file is written.
*
* @var integer
*/
private $_targetFormatVersion;
/**
* List of index file extensions
*
* Every key equals its value: the keys allow an isset() lookup on the last
* four characters of a file name, the values are iterated when per-segment
* files are purged.
*
* @var array
*/
private static $_indexExtensions = array('.cfs' => '.cfs',
'.cfx' => '.cfx',
'.fnm' => '.fnm',
'.fdx' => '.fdx',
'.fdt' => '.fdt',
'.tis' => '.tis',
'.tii' => '.tii',
'.frq' => '.frq',
'.prx' => '.prx',
'.tvx' => '.tvx',
'.tvd' => '.tvd',
'.tvf' => '.tvf',
'.del' => '.del',
'.sti' => '.sti' );
/**
 * Create empty index
 *
 * For $generation == 0 the index is created in pre-2.1 mode: index files
 * already present in the directory are deleted, then a 'segments' file and
 * an empty 'deletable' file are written.
 * For any other generation a 'segments.gen' file (generation stored twice)
 * and a generation-specific segments file are written.
 *
 * In both modes the segments file receives: format marker, index version
 * (current time in whole seconds, stored as two 32-bit halves), segment name
 * counter and a segment count of 0.
 *
 * @param Zend_Search_Lucene_Storage_Directory $directory
 * @param integer $generation
 * @param integer $nameCount
 */
public static function createIndex(Zend_Search_Lucene_Storage_Directory $directory, $generation, $nameCount)
{
    if ($generation == 0) {
        // Create index in pre-2.1 mode
        foreach ($directory->fileList() as $file) {
            if ($file == 'deletable' ||
                $file == 'segments'  ||
                isset(self::$_indexExtensions[ substr($file, strlen($file)-4)]) ||
                preg_match('/\.f\d+$/i', $file) /* matches <segment_name>.f<decimal_number> file names */) {
                $directory->deleteFile($file);
            }
        }

        $segmentsFile = $directory->createFile('segments');
        // pre-2.1 format marker
        $segmentsFile->writeInt((int)0xFFFFFFFF);
    } else {
        $genFile = $directory->createFile('segments.gen');
        $genFile->writeInt((int)0xFFFFFFFE);
        // Write generation two times
        $genFile->writeLong($generation);
        $genFile->writeLong($generation);

        $segmentsFile = $directory->createFile(Zend_Search_Lucene::getSegmentFileName($generation));
        // 2.1 format marker
        $segmentsFile->writeInt((int)0xFFFFFFFD);
    }

    // Write version (initialized by current time) as high and low 32-bit words.
    // The fractional part is dropped up front: applying a bitwise operator to a
    // fractional float is a deprecated implicit conversion on PHP 8.1+.
    $version = floor(microtime(true));
    $segmentsFile->writeInt((int)($version/((double)0xFFFFFFFF + 1)));
    $segmentsFile->writeInt((int)($version & 0xFFFFFFFF));

    // write name counter
    $segmentsFile->writeInt($nameCount);
    // write segment counter
    $segmentsFile->writeInt(0);

    if ($generation == 0) {
        $deletableFile = $directory->createFile('deletable');
        // write counter
        $deletableFile->writeInt(0);
    }
}
/**
 * Open the index for writing
 *
 * The segment list is bound by reference, so changes the writer makes to it
 * are visible through the array passed in by the caller.
 *
 * @param Zend_Search_Lucene_Storage_Directory $directory
 * @param array $segmentInfos
 * @param integer $targetFormatVersion
 */
public function __construct(Zend_Search_Lucene_Storage_Directory $directory, &$segmentInfos, $targetFormatVersion)
{
    $this->_targetFormatVersion = $targetFormatVersion;
    $this->_segmentInfos        = &$segmentInfos;
    $this->_directory           = $directory;
}
/**
 * Adds a document to this index.
 *
 * The document goes into the current in-memory segment, which is created on
 * demand. Once that segment holds at least $maxBufferedDocs documents it is
 * committed. Afterwards an auto-merge is attempted and the pending version
 * counter is incremented.
 *
 * @param Zend_Search_Lucene_Document $document
 */
public function addDocument(Zend_Search_Lucene_Document $document)
{
    $segment = $this->_currentSegment;

    if ($segment === null) {
        // Lazily start a new segment under a freshly reserved name
        $segment = new Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter(
            $this->_directory,
            $this->_newSegmentName()
        );
        $this->_currentSegment = $segment;
    }

    $segment->addDocument($document);

    $bufferIsFull = ($segment->count() >= $this->maxBufferedDocs);
    if ($bufferIsFull) {
        $this->commit();
    }

    $this->_maybeMergeSegments();

    $this->_versionUpdate++;
}
/**
 * Check if we have anything to merge
 *
 * Runs the same planning pass as _maybeMergeSegments() without merging:
 * segments are visited in ascending order of document count and accumulated
 * into a pool. The pool threshold starts at $maxBufferedDocs and is multiplied
 * by $mergeFactor whenever a segment is too large for the current level.
 *
 * Returns true as soon as a pool reaches its threshold. Returns false when no
 * pool does, when the threshold grows above $maxMergeDocs, or when the
 * threshold cannot grow at all ($mergeFactor of 1 or less).
 *
 * @return boolean
 */
private function _hasAnythingToMerge()
{
    $segmentSizes = array();
    foreach ($this->_segmentInfos as $segName => $segmentInfo) {
        $segmentSizes[$segName] = $segmentInfo->count();
    }
    asort($segmentSizes, SORT_NUMERIC);

    $poolSize    = 0;
    $sizeToMerge = $this->maxBufferedDocs;

    foreach ($segmentSizes as $size) {
        // Check, if segment comes into a new merging block
        while ($size >= $sizeToMerge) {
            // Previous block is large enough to be merged
            if ($poolSize >= $sizeToMerge) {
                return true;
            }

            $poolSize = 0;

            $nextSizeToMerge = $sizeToMerge * $this->mergeFactor;
            if ($nextSizeToMerge <= $sizeToMerge) {
                // The threshold does not grow, so this loop would never
                // terminate; there is no higher merge level to look at.
                return false;
            }

            $sizeToMerge = $nextSizeToMerge;
            if ($sizeToMerge > $this->maxMergeDocs) {
                return false;
            }
        }

        $poolSize += $size;
    }

    return $poolSize >= $sizeToMerge;
}
/**
 * Merge segments if necessary
 *
 * Does nothing when the optimization lock cannot be obtained. Otherwise the
 * auto-optimization pass is run and the optimization lock is released
 * afterwards, also when the pass fails with an exception (which is re-thrown).
 */
private function _maybeMergeSegments()
{
    if (Zend_Search_Lucene_LockManager::obtainOptimizationLock($this->_directory) === false) {
        return;
    }

    try {
        $this->_mergeSegmentPools();
    } catch (Exception $e) {
        // Do not keep the optimization lock when merging fails
        Zend_Search_Lucene_LockManager::releaseOptimizationLock($this->_directory);
        throw $e;
    }

    Zend_Search_Lucene_LockManager::releaseOptimizationLock($this->_directory);
}

/**
 * Standard auto-optimization procedure; the caller must hold the optimization lock.
 *
 * Segments are visited in ascending order of document count and collected
 * into a pool. A pool is merged once its total size reaches the current
 * threshold. The threshold starts at $maxBufferedDocs and is multiplied by
 * $mergeFactor for every merge level. The pass stops when the threshold
 * grows above $maxMergeDocs or cannot grow at all.
 */
private function _mergeSegmentPools()
{
    if (!$this->_hasAnythingToMerge()) {
        return;
    }

    // Update segments list to be sure all segments are not merged yet by another process
    //
    // Segment merging functionality is concentrated in this class and surrounded
    // by optimization lock obtaining/releasing.
    // _updateSegments() refreshes segments list from the latest index generation.
    // So only new segments can be added to the index while we are merging some already existing
    // segments.
    // Newly added segments will be also included into the index by the _updateSegments() call
    // either by another process or by the current process with the commit() call at the end of _mergeSegments() method.
    // That's guaranteed by the serialisation of _updateSegments() execution using exclusive locks.
    $this->_updateSegments();

    $segmentSizes = array();
    foreach ($this->_segmentInfos as $segName => $segmentInfo) {
        $segmentSizes[$segName] = $segmentInfo->count();
    }
    asort($segmentSizes, SORT_NUMERIC);

    $mergePool   = array();
    $poolSize    = 0;
    $sizeToMerge = $this->maxBufferedDocs;

    foreach ($segmentSizes as $segName => $size) {
        // Check, if segment comes into a new merging block
        while ($size >= $sizeToMerge) {
            // Merge previous block if it's large enough
            if ($poolSize >= $sizeToMerge) {
                $this->_mergeSegments($mergePool);
            }
            $mergePool = array();
            $poolSize  = 0;

            $nextSizeToMerge = $sizeToMerge * $this->mergeFactor;
            if ($nextSizeToMerge <= $sizeToMerge) {
                // The threshold does not grow ($mergeFactor of 1 or less):
                // stop instead of looping forever.
                return;
            }

            $sizeToMerge = $nextSizeToMerge;
            if ($sizeToMerge > $this->maxMergeDocs) {
                return;
            }
        }

        $mergePool[] = $this->_segmentInfos[$segName];
        $poolSize   += $size;
    }

    if ($poolSize >= $sizeToMerge) {
        $this->_mergeSegments($mergePool);
    }
}
/**
 * Merge specified segments
 *
 * Every given segment is registered as a merge source and scheduled for
 * deletion. The merge result (if any) is queued as a new segment and the
 * change is committed.
 *
 * $segments is an array of SegmentInfo objects
 *
 * @param array $segments
 */
private function _mergeSegments($segments)
{
    $merger = new Zend_Search_Lucene_Index_SegmentMerger($this->_directory, $this->_newSegmentName());

    foreach ($segments as $source) {
        $merger->addSource($source);

        // Names are stored as key and as value
        $sourceName = $source->getName();
        $this->_segmentsToDelete[$sourceName] = $sourceName;
    }

    $merged = $merger->merge();
    if ($merged !== null) {
        $this->_newSegments[$merged->getName()] = $merged;
    }

    $this->commit();
}
/**
* Update segments file by adding current segment to a list
*
* Steps performed under the index write lock:
*  1. write pending changes of every known segment;
*  2. read the segments file of the actual generation and write the file of the next
*     generation: existing segments that are not scheduled for deletion are copied
*     (and loaded into $_segmentInfos if unknown), then the segments collected in
*     $_newSegments are appended;
*  3. store the new generation number in 'segments.gen' (before and after writing);
*  4. if the read lock can be escalated, delete index files which are no longer used,
*     otherwise only purge the files of the segments scheduled for deletion;
*  5. release the write lock and drop segments missing from the new generation
*     from $_segmentInfos.
*
* If step 2 fails, the previous generation number is restored in 'segments.gen',
* the write lock is released and the exception is re-thrown.
*
* @throws Zend_Search_Lucene_Exception
*/
private function _updateSegments()
{
// Get an exclusive index lock
Zend_Search_Lucene_LockManager::obtainWriteLock($this->_directory);
// Write down changes for the segments
foreach ($this->_segmentInfos as $segInfo) {
$segInfo->writeChanges();
}
// $segmentsFile is the source (actual generation), $newSegmentFile the target (next generation).
// Note that $generation is incremented here and refers to the NEW generation from now on.
$generation = Zend_Search_Lucene::getActualGeneration($this->_directory);
$segmentsFile = $this->_directory->getFileObject(Zend_Search_Lucene::getSegmentFileName($generation), false);
$newSegmentFile = $this->_directory->createFile(Zend_Search_Lucene::getSegmentFileName(++$generation), false);
try {
$genFile = $this->_directory->getFileObject('segments.gen', false);
} catch (Zend_Search_Lucene_Exception $e) {
// A missing 'segments.gen' is recognised by the exception message text and the file is created
if (strpos($e->getMessage(), 'is not readable') !== false) {
$genFile = $this->_directory->createFile('segments.gen');
} else {
throw $e;
}
}
$genFile->writeInt((int)0xFFFFFFFE);
// Write generation (first copy)
$genFile->writeLong($generation);
try {
// Write format marker
// NOTE(review): no marker is written when the target format is neither FORMAT_2_1 nor FORMAT_2_3
if ($this->_targetFormatVersion == Zend_Search_lucene::FORMAT_2_1) {
$newSegmentFile->writeInt((int)0xFFFFFFFD);
} else if ($this->_targetFormatVersion == Zend_Search_lucene::FORMAT_2_3) {
$newSegmentFile->writeInt((int)0xFFFFFFFC);
}
// Read src file format identifier
$format = $segmentsFile->readInt();
if ($format == (int)0xFFFFFFFF) {
$srcFormat = Zend_Search_Lucene::FORMAT_PRE_2_1;
} else if ($format == (int)0xFFFFFFFD) {
$srcFormat = Zend_Search_Lucene::FORMAT_2_1;
} else if ($format == (int)0xFFFFFFFC) {
$srcFormat = Zend_Search_Lucene::FORMAT_2_3;
} else {
throw new Zend_Search_Lucene_Exception('Unsupported segments file format');
}
// $version = $segmentsFile->readLong() + $this->_versionUpdate;
// Process version on 32-bit platforms
// The 64-bit version is read as two 32-bit words and recombined as a float;
// a negative low word is mapped back to its unsigned value.
$versionHigh = $segmentsFile->readInt();
$versionLow = $segmentsFile->readInt();
$version = $versionHigh * ((double)0xFFFFFFFF + 1) +
(($versionLow < 0)? (double)0xFFFFFFFF - (-1 - $versionLow) : $versionLow);
$version += $this->_versionUpdate;
$this->_versionUpdate = 0;
$newSegmentFile->writeInt((int)($version/((double)0xFFFFFFFF + 1)));
$newSegmentFile->writeInt((int)($version & 0xFFFFFFFF));
// Write segment name counter
$newSegmentFile->writeInt($segmentsFile->readInt());
// Get number of segments offset
$numOfSegmentsOffset = $newSegmentFile->tell();
// Write dummy data (segment counter)
$newSegmentFile->writeInt(0);
// Read number of segments
$segmentsCount = $segmentsFile->readInt();
// $segments collects <segment name> => <segment size> for every segment of the new generation
$segments = array();
for ($count = 0; $count < $segmentsCount; $count++) {
$segName = $segmentsFile->readString();
$segSize = $segmentsFile->readInt();
if ($srcFormat == Zend_Search_Lucene::FORMAT_PRE_2_1) {
// pre-2.1 index format
// The record has no further fields, so defaults are substituted
$delGenHigh = 0;
$delGenLow = 0;
$hasSingleNormFile = false;
$numField = (int)0xFFFFFFFF;
$isCompoundByte = 0;
$docStoreOptions = null;
} else {
//$delGen = $segmentsFile->readLong();
$delGenHigh = $segmentsFile->readInt();
$delGenLow = $segmentsFile->readInt();
if ($srcFormat == Zend_Search_Lucene::FORMAT_2_3) {
// Shared doc store description is present only if the offset is not -1
$docStoreOffset = $segmentsFile->readInt();
if ($docStoreOffset != -1) {
$docStoreSegment = $segmentsFile->readString();
$docStoreIsCompoundFile = $segmentsFile->readByte();
$docStoreOptions = array('offset' => $docStoreOffset,
'segment' => $docStoreSegment,
'isCompound' => ($docStoreIsCompoundFile == 1));
} else {
$docStoreOptions = null;
}
} else {
$docStoreOptions = null;
}
$hasSingleNormFile = $segmentsFile->readByte();
$numField = $segmentsFile->readInt();
// Norm generations are stored only if the field count is not the 0xFFFFFFFF marker
$normGens = array();
if ($numField != (int)0xFFFFFFFF) {
for ($count1 = 0; $count1 < $numField; $count1++) {
$normGens[] = $segmentsFile->readLong();
}
}
$isCompoundByte = $segmentsFile->readByte();
}
// Segments scheduled for deletion are not copied into the new generation
if (!in_array($segName, $this->_segmentsToDelete)) {
// Load segment if necessary
if (!isset($this->_segmentInfos[$segName])) {
if (PHP_INT_SIZE > 4) {
// 64-bit system
$delGen = $delGenHigh << 32 |
$delGenLow;
} else {
$delGen = $delGenHigh * ((double)0xFFFFFFFF + 1) +
(($delGenLow < 0)? (double)0xFFFFFFFF - (-1 - $delGenLow) : $delGenLow);
}
// NOTE(review): $isCompound is not assigned for byte values other than 0xFF, 0x00 and 0x01
if ($isCompoundByte == 0xFF) {
// The segment is not a compound file
$isCompound = false;
} else if ($isCompoundByte == 0x00) {
// The status is unknown
$isCompound = null;
} else if ($isCompoundByte == 0x01) {
// The segment is a compound file
$isCompound = true;
}
$this->_segmentInfos[$segName] =
new Zend_Search_Lucene_Index_SegmentInfo($this->_directory,
$segName,
$segSize,
$delGen,
$docStoreOptions,
$hasSingleNormFile,
$isCompound);
} else {
// Retrieve actual deletions file generation number
// (the in-memory segment info overrides the value read from the file)
$delGen = $this->_segmentInfos[$segName]->getDelGen();
if ($delGen >= 0) {
if (PHP_INT_SIZE > 4) {
// 64-bit system
$delGenHigh = $delGen >> 32 & 0xFFFFFFFF;
$delGenLow = $delGen & 0xFFFFFFFF;
} else {
$delGenHigh = (int)($delGen/((double)0xFFFFFFFF + 1));
$delGenLow =(int)($delGen & 0xFFFFFFFF);
}
} else {
// Negative generation is written as two 0xFFFFFFFF words
$delGenHigh = $delGenLow = (int)0xFFFFFFFF;
}
}
$newSegmentFile->writeString($segName);
$newSegmentFile->writeInt($segSize);
$newSegmentFile->writeInt($delGenHigh);
$newSegmentFile->writeInt($delGenLow);
if ($this->_targetFormatVersion == Zend_Search_Lucene::FORMAT_2_3) {
if ($docStoreOptions !== null) {
$newSegmentFile->writeInt($docStoreOffset);
$newSegmentFile->writeString($docStoreSegment);
$newSegmentFile->writeByte($docStoreIsCompoundFile);
} else {
// Set DocStoreOffset to -1
$newSegmentFile->writeInt((int)0xFFFFFFFF);
}
} else if ($docStoreOptions !== null) {
// Shared doc store data cannot be stored in a lower format version
// Release index write lock
// NOTE(review): the enclosing catch block releases the write lock once more before re-throwing
Zend_Search_Lucene_LockManager::releaseWriteLock($this->_directory);
throw new Zend_Search_Lucene_Exception('Index conversion to lower format version is not supported.');
}
$newSegmentFile->writeByte($hasSingleNormFile);
$newSegmentFile->writeInt($numField);
if ($numField != (int)0xFFFFFFFF) {
foreach ($normGens as $normGen) {
$newSegmentFile->writeLong($normGen);
}
}
$newSegmentFile->writeByte($isCompoundByte);
$segments[$segName] = $segSize;
}
}
$segmentsFile->close();
$segmentsCount = count($segments) + count($this->_newSegments);
// Append the segments created by this writer
foreach ($this->_newSegments as $segName => $segmentInfo) {
$newSegmentFile->writeString($segName);
$newSegmentFile->writeInt($segmentInfo->count());
// delete file generation: -1 (there is no delete file yet)
$newSegmentFile->writeInt((int)0xFFFFFFFF);$newSegmentFile->writeInt((int)0xFFFFFFFF);
if ($this->_targetFormatVersion == Zend_Search_Lucene::FORMAT_2_3) {
// docStoreOffset: -1 (segment doesn't use shared doc store)
$newSegmentFile->writeInt((int)0xFFFFFFFF);
}
// HasSingleNormFile
$newSegmentFile->writeByte($segmentInfo->hasSingleNormFile());
// NumField
$newSegmentFile->writeInt((int)0xFFFFFFFF);
// IsCompoundFile
$newSegmentFile->writeByte($segmentInfo->isCompound() ? 1 : -1);
$segments[$segmentInfo->getName()] = $segmentInfo->count();
$this->_segmentInfos[$segName] = $segmentInfo;
}
$this->_newSegments = array();
// Go back and replace the dummy segment counter with the real value
$newSegmentFile->seek($numOfSegmentsOffset);
$newSegmentFile->writeInt($segmentsCount); // Update segments count
$newSegmentFile->close();
} catch (Exception $e) {
/** Restore previous index generation */
$generation--;
// Skip the 4-byte marker at the start of 'segments.gen'
$genFile->seek(4, SEEK_SET);
// Write generation number twice
$genFile->writeLong($generation); $genFile->writeLong($generation);
// Release index write lock
Zend_Search_Lucene_LockManager::releaseWriteLock($this->_directory);
// Throw the exception
throw $e;
}
// Write generation (second copy)
$genFile->writeLong($generation);
// Check if another update or read process is not running now
// If yes, skip clean-up procedure
if (Zend_Search_Lucene_LockManager::escalateReadLock($this->_directory)) {
/**
* Clean-up directory
*/
// Three parallel lists: file name, deletion group (type) and order number within the group.
// They are sorted together below to define the deletion order.
$filesToDelete = array();
$filesTypes = array();
$filesNumbers = array();
// list of .del files of currently used segments
// each segment can have several generations of .del files
// only last should not be deleted
$delFiles = array();
foreach ($this->_directory->fileList() as $file) {
if ($file == 'deletable') {
// 'deletable' file
$filesToDelete[] = $file;
$filesTypes[] = 0; // delete this file first, since it's not used starting from Lucene v2.1
$filesNumbers[] = 0;
} else if ($file == 'segments') {
// 'segments' file
$filesToDelete[] = $file;
$filesTypes[] = 1; // second file to be deleted "zero" version of segments file (Lucene pre-2.1)
$filesNumbers[] = 0;
} else if (preg_match('/^segments_[a-zA-Z0-9]+$/i', $file)) {
// 'segments_xxx' file
// Check if it's not a just created generation file
if ($file != Zend_Search_Lucene::getSegmentFileName($generation)) {
$filesToDelete[] = $file;
$filesTypes[] = 2; // first group of files for deletions
$filesNumbers[] = (int)base_convert(substr($file, 9), 36, 10); // ordered by segment generation numbers
}
} else if (preg_match('/(^_([a-zA-Z0-9]+))\.f\d+$/i', $file, $matches)) {
// one of per segment files ('<segment_name>.f<decimal_number>')
// Check if it's not one of the segments in the current segments set
if (!isset($segments[$matches[1]])) {
$filesToDelete[] = $file;
$filesTypes[] = 3; // second group of files for deletions
$filesNumbers[] = (int)base_convert($matches[2], 36, 10); // order by segment number
}
} else if (preg_match('/(^_([a-zA-Z0-9]+))(_([a-zA-Z0-9]+))\.del$/i', $file, $matches)) {
// one of per segment files ('<segment_name>_<del_generation>.del' where <segment_name> is '_<segment_number>')
// Check if it's not one of the segments in the current segments set
if (!isset($segments[$matches[1]])) {
$filesToDelete[] = $file;
$filesTypes[] = 3; // second group of files for deletions
$filesNumbers[] = (int)base_convert($matches[2], 36, 10); // order by segment number
} else {
// Segment is still in use: remember the file, obsolete generations are selected below
$segmentNumber = (int)base_convert($matches[2], 36, 10);
$delGeneration = (int)base_convert($matches[4], 36, 10);
if (!isset($delFiles[$segmentNumber])) {
$delFiles[$segmentNumber] = array();
}
$delFiles[$segmentNumber][$delGeneration] = $file;
}
} else if (isset(self::$_indexExtensions[substr($file, strlen($file)-4)])) {
// one of per segment files ('<segment_name>.<ext>')
$segmentName = substr($file, 0, strlen($file) - 4);
// Check if it's not one of the segments in the current segments set
// (files of the segment which is currently being written are kept as well)
if (!isset($segments[$segmentName]) &&
($this->_currentSegment === null || $this->_currentSegment->getName() != $segmentName)) {
$filesToDelete[] = $file;
$filesTypes[] = 3; // second group of files for deletions
$filesNumbers[] = (int)base_convert(substr($file, 1 /* skip '_' */, strlen($file)-5), 36, 10); // order by segment number
}
}
}
$maxGenNumber = 0;
// process .del files of currently used segments
foreach ($delFiles as $segmentNumber => $segmentDelFiles) {
ksort($delFiles[$segmentNumber], SORT_NUMERIC);
array_pop($delFiles[$segmentNumber]); // remove last delete file generation from candidates for deleting
// Track the highest generation among the remaining (obsolete) .del files
end($delFiles[$segmentNumber]);
$lastGenNumber = key($delFiles[$segmentNumber]);
if ($lastGenNumber > $maxGenNumber) {
$maxGenNumber = $lastGenNumber;
}
}
foreach ($delFiles as $segmentNumber => $segmentDelFiles) {
foreach ($segmentDelFiles as $delGeneration => $file) {
$filesToDelete[] = $file;
$filesTypes[] = 4; // third group of files for deletions
$filesNumbers[] = $segmentNumber*$maxGenNumber + $delGeneration; // order by <segment_number>,<del_generation> pair
}
}
// Reorder files for deleting
array_multisort($filesTypes, SORT_ASC, SORT_NUMERIC,
$filesNumbers, SORT_ASC, SORT_NUMERIC,
$filesToDelete, SORT_ASC, SORT_STRING);
foreach ($filesToDelete as $file) {
try {
/** Skip shared docstore segments deleting */
/** @todo Process '.cfx' files to check if them are already unused */
if (substr($file, strlen($file)-4) != '.cfx') {
$this->_directory->deleteFile($file);
}
} catch (Zend_Search_Lucene_Exception $e) {
if (strpos($e->getMessage(), 'Can\'t delete file') === false) {
// That's not "file is under processing or already deleted" exception
// Pass it through
throw $e;
}
}
}
// Return read lock into the previous state
Zend_Search_Lucene_LockManager::deEscalateReadLock($this->_directory);
} else {
// Only release resources if another index reader is running now
foreach ($this->_segmentsToDelete as $segName) {
foreach (self::$_indexExtensions as $ext) {
$this->_directory->purgeFile($segName . $ext);
}
}
}
// Clean-up _segmentsToDelete container
$this->_segmentsToDelete = array();
// Release index write lock
Zend_Search_Lucene_LockManager::releaseWriteLock($this->_directory);
// Remove unused segments from segments list
foreach ($this->_segmentInfos as $segName => $segmentInfo) {
if (!isset($segments[$segName])) {
unset($this->_segmentInfos[$segName]);
}
}
}
/**
 * Commit current changes
 *
 * Closes the segment currently being written (if there is one), queues the
 * resulting segment for registration and rewrites the segments file.
 */
public function commit()
{
    if (null !== $this->_currentSegment) {
        $closedSegment = $this->_currentSegment->close();

        // close() may produce no segment at all
        if (null !== $closedSegment) {
            $closedName = $closedSegment->getName();
            $this->_newSegments[$closedName] = $closedSegment;
        }

        $this->_currentSegment = null;
    }

    $this->_updateSegments();
}
/**
* Merges the provided indexes into this index.
*
* Not implemented yet: the method body is empty, so calling it currently
* has no effect and $readers is ignored.
*
* @param array $readers
* @return void
*/
public function addIndexes($readers)
{
/**
* @todo implementation
*/
}
/**
 * Merges all segments together into new one
 *
 * Returns true on success and false if another optimization or auto-optimization process
 * is running now
 *
 * The optimization lock is released in every case, including when refreshing
 * or merging the segments fails; the exception is then re-thrown.
 *
 * @return boolean
 * @throws Exception
 */
public function optimize()
{
    if (Zend_Search_Lucene_LockManager::obtainOptimizationLock($this->_directory) === false) {
        return false;
    }

    try {
        // Update segments list to be sure all segments are not merged yet by another process
        //
        // Segment merging functionality is concentrated in this class and surrounded
        // by optimization lock obtaining/releasing.
        // _updateSegments() refreshes segments list from the latest index generation.
        // So only new segments can be added to the index while we are merging some already existing
        // segments.
        // Newly added segments will be also included into the index by the _updateSegments() call
        // either by another process or by the current process with the commit() call at the end of _mergeSegments() method.
        // That's guaranteed by the serialisation of _updateSegments() execution using exclusive locks.
        $this->_updateSegments();

        $this->_mergeSegments($this->_segmentInfos);
    } catch (Exception $e) {
        // Do not keep the optimization lock when optimization fails
        Zend_Search_Lucene_LockManager::releaseOptimizationLock($this->_directory);
        throw $e;
    }

    Zend_Search_Lucene_LockManager::releaseOptimizationLock($this->_directory);

    return true;
}
/**
 * Get name for new segment
 *
 * Reads the segment name counter from the segments file of the actual
 * generation, stores the incremented counter back and returns the old value
 * as '_' followed by its base-36 representation.
 *
 * The whole read-increment-write sequence runs under the index write lock.
 * The lock is released again if any step fails; the exception is re-thrown.
 *
 * @return string
 * @throws Exception
 */
private function _newSegmentName()
{
    Zend_Search_Lucene_LockManager::obtainWriteLock($this->_directory);

    try {
        $generation   = Zend_Search_Lucene::getActualGeneration($this->_directory);
        $segmentsFile = $this->_directory->getFileObject(Zend_Search_Lucene::getSegmentFileName($generation), false);

        $segmentsFile->seek(12); // 12 = 4 (int, file format marker) + 8 (long, index version)
        $segmentNameCounter = $segmentsFile->readInt();

        $segmentsFile->seek(12); // 12 = 4 (int, file format marker) + 8 (long, index version)
        $segmentsFile->writeInt($segmentNameCounter + 1);

        // Flush output to guarantee that wrong value will not be loaded between unlock and
        // return (which calls $segmentsFile destructor)
        $segmentsFile->flush();
    } catch (Exception $e) {
        // Do not keep the write lock when the counter cannot be updated
        Zend_Search_Lucene_LockManager::releaseWriteLock($this->_directory);
        throw $e;
    }

    Zend_Search_Lucene_LockManager::releaseWriteLock($this->_directory);

    return '_' . base_convert($segmentNameCounter, 10, 36);
}
}
PK =hH[!��( ( DictionaryLoader.phpnu &1i� <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Exception */
require_once 'Zend/Search/Lucene/Exception.php';
/**
 * Dictionary loader
 *
 * It's a dummy class which is created to encapsulate poorly structured code.
 * Manual "method inlining" is performed to speed up the dictionary index loading operation,
 * which is a major bottleneck for search performance.
 *
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Index_DictionaryLoader
{
    /**
     * Dictionary index loader.
     *
     * It takes a string which is actually <segment_name>.tii index file data and
     * returns two arrays - term and termInfo lists.
     *
     * The result is array($termDictionary, $termInfos) where
     *   $termDictionary[] = array(<field number>, <term text>)
     *   $termInfos[]      = array(<docFreq>, <freqPointer>, <proxPointer>, <skipDelta>, <indexPointer>)
     * The field number of the first (special) entry is normalized to -1.
     *
     * File reading calls are inlined on purpose; the commented-out call in front
     * of each inlined fragment shows what the fragment stands for.
     *
     * See Zend_Search_Lucene_Index_SegmentInfo class for details
     *
     * @param string $data
     * @return array
     * @throws Zend_Search_Lucene_Exception
     */
    public static function load($data)
    {
        $termDictionary = array();
        $termInfos      = array();
        $pos            = 0;

        // The fixed part of the header takes 20 bytes:
        // version (4) + term count (8) + index interval (4) + skip interval (4)
        if (strlen($data) < 20) {
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
        }

        // $tiVersion = $tiiFile->readInt();
        $tiVersion = ord($data[0]) << 24 | ord($data[1]) << 16 | ord($data[2]) << 8  | ord($data[3]);
        $pos += 4;
        if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */ &&
            $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
        }

        // $indexTermCount = $tiiFile->readLong();
        if (PHP_INT_SIZE > 4) {
            $indexTermCount = ord($data[$pos]) << 56  |
                              ord($data[$pos+1]) << 48  |
                              ord($data[$pos+2]) << 40  |
                              ord($data[$pos+3]) << 32  |
                              ord($data[$pos+4]) << 24  |
                              ord($data[$pos+5]) << 16  |
                              ord($data[$pos+6]) << 8   |
                              ord($data[$pos+7]);
        } else {
            // Only values which fit into a positive 32-bit integer are supported here
            if ((ord($data[$pos])            != 0) ||
                (ord($data[$pos+1])          != 0) ||
                (ord($data[$pos+2])          != 0) ||
                (ord($data[$pos+3])          != 0) ||
                ((ord($data[$pos+4]) & 0x80) != 0)) {
                     throw new Zend_Search_Lucene_Exception('Largest supported segment size (for 32-bit mode) is 2Gb');
                 }

            $indexTermCount = ord($data[$pos+4]) << 24  |
                              ord($data[$pos+5]) << 16  |
                              ord($data[$pos+6]) << 8   |
                              ord($data[$pos+7]);
        }
        $pos += 8;

        //                  $tiiFile->readInt();  // IndexInterval
        $pos += 4;

        // $skipInterval   = $tiiFile->readInt();
        $skipInterval = ord($data[$pos]) << 24 | ord($data[$pos+1]) << 16 | ord($data[$pos+2]) << 8  | ord($data[$pos+3]);
        $pos += 4;
        if ($indexTermCount < 1) {
            throw new Zend_Search_Lucene_Exception('Wrong number of terms in a term dictionary index');
        }

        if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
            /* Skip MaxSkipLevels value */
            $pos += 4;
        }

        // Pointers are stored as deltas, terms as <shared prefix length> + <suffix>
        $prevTerm     = '';
        $freqPointer  =  0;
        $proxPointer  =  0;
        $indexPointer =  0;
        for ($count = 0; $count < $indexTermCount; $count++) {
            //$termPrefixLength = $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $termPrefixLength = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $termPrefixLength |= ($nbyte & 0x7F) << $shift;
            }

            // $termSuffix       = $tiiFile->readString();
            $nbyte = ord($data[$pos++]);
            $len = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $len |= ($nbyte & 0x7F) << $shift;
            }
            if ($len == 0) {
                $termSuffix = '';
            } else {
                // $len is a number of characters. It is extended to a number of bytes
                // while multi-byte characters are discovered.
                $termSuffix = substr($data, $pos, $len);
                $pos += $len;
                for ($count1 = 0; $count1 < $len; $count1++ ) {
                    if (( ord($termSuffix[$count1]) & 0xC0 ) == 0xC0) {
                        $addBytes = 1;
                        if (ord($termSuffix[$count1]) & 0x20 ) {
                            $addBytes++;

                            // Never used for Java Lucene created index.
                            // Java2 doesn't encode strings in four bytes
                            if (ord($termSuffix[$count1]) & 0x10 ) {
                                $addBytes++;
                            }
                        }
                        $termSuffix .= substr($data, $pos, $addBytes);
                        $pos += $addBytes;
                        $len += $addBytes;

                        // Check for null character. Java2 encodes null character
                        // in two bytes (0xC0 0x80).
                        if (ord($termSuffix[$count1]) == 0xC0 &&
                            ord($termSuffix[$count1+1]) == 0x80   ) {
                            // Replace the pair with a real NUL byte. (Assigning integer 0
                            // to a string offset would store the character '0' instead.)
                            $termSuffix = substr($termSuffix, 0, $count1)
                                        . "\0"
                                        . substr($termSuffix, $count1+2);
                            // Two bytes were collapsed into one; the loop increment
                            // moves on to the byte right after the NUL.
                            $len--;
                        } else {
                            // Skip continuation bytes of the character
                            $count1 += $addBytes;
                        }
                    }
                }
            }

            // $termValue        = Zend_Search_Lucene_Index_Term::getPrefix($prevTerm, $termPrefixLength) . $termSuffix;
            // $pb counts prefix bytes, $pc counts prefix characters
            $pb = 0; $pc = 0;
            $prevTermLength = strlen($prevTerm);
            while ($pb < $prevTermLength  &&  $pc < $termPrefixLength) {
                $charBytes = 1;
                if ((ord($prevTerm[$pb]) & 0xC0) == 0xC0) {
                    $charBytes++;
                    if (ord($prevTerm[$pb]) & 0x20 ) {
                        $charBytes++;
                        if (ord($prevTerm[$pb]) & 0x10 ) {
                            $charBytes++;
                        }
                    }
                }

                // The character has to fit into the previous term (not into the whole file data)
                if ($pb + $charBytes > $prevTermLength) {
                    // wrong character
                    break;
                }

                $pc++;
                $pb += $charBytes;
            }
            $termValue = substr($prevTerm, 0, $pb) . $termSuffix;

            // $termFieldNum     = $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $termFieldNum = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $termFieldNum |= ($nbyte & 0x7F) << $shift;
            }

            // $docFreq          = $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $docFreq = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $docFreq |= ($nbyte & 0x7F) << $shift;
            }

            // $freqPointer     += $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $vint = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $vint |= ($nbyte & 0x7F) << $shift;
            }
            $freqPointer += $vint;

            // $proxPointer     += $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $vint = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $vint |= ($nbyte & 0x7F) << $shift;
            }
            $proxPointer += $vint;

            // The skip delta is stored only for terms with at least $skipInterval documents
            if( $docFreq >= $skipInterval ) {
                // $skipDelta = $tiiFile->readVInt();
                $nbyte = ord($data[$pos++]);
                $vint = $nbyte & 0x7F;
                for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                    $nbyte = ord($data[$pos++]);
                    $vint |= ($nbyte & 0x7F) << $shift;
                }
                $skipDelta = $vint;
            } else {
                $skipDelta = 0;
            }

            // $indexPointer    += $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $vint = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $vint |= ($nbyte & 0x7F) << $shift;
            }
            $indexPointer += $vint;


            // $this->_termDictionary[] =  new Zend_Search_Lucene_Index_Term($termValue, $termFieldNum);
            $termDictionary[] = array($termFieldNum, $termValue);

            $termInfos[] =
                 // new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);
                 array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);

            $prevTerm = $termValue;
        }

        // Check special index entry mark
        if ($termDictionary[0][0] != (int)0xFFFFFFFF) {
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
        } else if (PHP_INT_SIZE > 4){
            // Treat 64-bit 0xFFFFFFFF as -1
            $termDictionary[0][0] = -1;
        }

        return array(&$termDictionary, &$termInfos);
    }
}
PK =hH[��� � DocsFilter.phpnu &1i� <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
* A Zend_Search_Lucene_Index_DocsFilter is used to filter documents while searching.
*
* It may or _may_not_ be used for actual filtering, so it's just a hint that the upper query limits
* the search result by the specified list.
*
* The class is a plain data holder: it declares the filter container only and
* contains no logic of its own.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Index_DocsFilter
{
/**
* Set of segment filters, one list of document ids per segment name.
* Document ids are stored as array keys; the values are not defined:
* array( <segmentName> => array(<docId> => <undefined_value>,
* <docId> => <undefined_value>,
* <docId> => <undefined_value>,
* ... ),
* <segmentName> => array(<docId> => <undefined_value>,
* <docId> => <undefined_value>,
* <docId> => <undefined_value>,
* ... ),
* <segmentName> => array(<docId> => <undefined_value>,
* <docId> => <undefined_value>,
* <docId> => <undefined_value>,
* ... ),
* ...
* )
*
* Empty by default.
*
* @var array
*/
public $segmentFilters = array();
}
PK =hH[堵 # # SegmentMerger.phpnu &1i� <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Exception */
require_once 'Zend/Search/Lucene/Exception.php';
/** Zend_Search_Lucene_Index_SegmentInfo */
require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
/** Zend_Search_Lucene_Index_SegmentWriter_StreamWriter */
require_once 'Zend/Search/Lucene/Index/SegmentWriter/StreamWriter.php';
/** Zend_Search_Lucene_Index_SegmentInfoPriorityQueue */
require_once 'Zend/Search/Lucene/Index/SegmentInfoPriorityQueue.php';
/**
 * Merges a set of source segments into one newly written segment.
 *
 * Source segments are registered with addSource(); merge() then copies their
 * field infos, norms, stored fields and term dictionaries into the target
 * segment writer. Norms and stored fields of documents marked as deleted are
 * not copied.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Index_SegmentMerger
{
    /**
     * Target segment writer
     *
     * @var Zend_Search_Lucene_Index_SegmentWriter_StreamWriter
     */
    private $_writer;

    /**
     * Number of docs in the new segment (counted while stored fields are merged)
     *
     * @var integer
     */
    private $_docCount;

    /**
     * Segments to be merged, keyed by segment name
     *
     * @var array Zend_Search_Lucene_Index_SegmentInfo
     */
    private $_segmentInfos = array();

    /**
     * Flag signalling that the merge is already done
     *
     * @var boolean
     */
    private $_mergeDone = false;

    /**
     * Field map
     * [<segment_name>][<field_number>] => <target_field_number>
     *
     * @var array
     */
    private $_fieldsMap = array();


    /**
     * Object constructor.
     *
     * Creates a new segment merger with $directory as the target to merge
     * segments into and $name as the name of the new segment.
     *
     * @param Zend_Search_Lucene_Storage_Directory $directory
     * @param string $name
     */
    public function __construct($directory, $name)
    {
        $this->_writer = new Zend_Search_Lucene_Index_SegmentWriter_StreamWriter($directory, $name);
    }


    /**
     * Add a segment to the collection of segments to be merged.
     *
     * A segment added twice under the same name replaces the earlier entry.
     *
     * @param Zend_Search_Lucene_Index_SegmentInfo $segmentInfo
     */
    public function addSource(Zend_Search_Lucene_Index_SegmentInfo $segmentInfo)
    {
        $this->_segmentInfos[$segmentInfo->getName()] = $segmentInfo;
    }


    /**
     * Do merge.
     *
     * Returns the value produced by closing the target writer, i.e. the
     * segment info of the newly created segment.
     *
     * @return Zend_Search_Lucene_Index_SegmentInfo
     * @throws Zend_Search_Lucene_Exception  if the merge was already performed
     *                                       or no source segment was added
     */
    public function merge()
    {
        if ($this->_mergeDone) {
            throw new Zend_Search_Lucene_Exception('Merge is already done.');
        }

        $sourceCount = count($this->_segmentInfos);
        if ($sourceCount < 1) {
            throw new Zend_Search_Lucene_Exception('Wrong number of segments to be merged ('
                                                 . $sourceCount
                                                 . ').');
        }

        $this->_mergeFields();
        $this->_mergeNorms();
        $this->_mergeStoredFields();
        $this->_mergeTerms();

        $this->_mergeDone = true;

        return $this->_writer->close();
    }


    /**
     * Merge fields information.
     *
     * Registers every field of every source segment with the writer and
     * records the source-number => target-number mapping in $_fieldsMap.
     */
    private function _mergeFields()
    {
        foreach ($this->_segmentInfos as $segName => $segmentInfo) {
            foreach ($segmentInfo->getFieldInfos() as $fieldInfo) {
                $this->_fieldsMap[$segName][$fieldInfo->number] = $this->_writer->addFieldInfo($fieldInfo);
            }
        }
    }


    /**
     * Merge fields' normalization factors.
     *
     * For each indexed field the norm vectors of all source segments are
     * passed to the writer in segment order; bytes belonging to deleted
     * documents are dropped first.
     */
    private function _mergeNorms()
    {
        foreach ($this->_writer->getFieldInfos() as $fieldInfo) {
            if (!$fieldInfo->isIndexed) {
                continue;
            }

            foreach ($this->_segmentInfos as $segmentInfo) {
                if (!$segmentInfo->hasDeletions()) {
                    // Nothing to filter out, the vector can be copied as is
                    $this->_writer->addNorm($fieldInfo->name, $segmentInfo->normVector($fieldInfo->name));
                    continue;
                }

                $srcNorm = $segmentInfo->normVector($fieldInfo->name);
                $norm    = '';
                $docs    = $segmentInfo->count();
                for ($docNum = 0; $docNum < $docs; $docNum++) {
                    if (!$segmentInfo->isDeleted($docNum)) {
                        $norm .= $srcNorm[$docNum];
                    }
                }
                $this->_writer->addNorm($fieldInfo->name, $norm);
            }
        }
    }


    /**
     * Merge stored fields.
     *
     * Reads the '.fdt' data of every source segment sequentially and hands
     * the stored fields of each non-deleted document to the writer.
     */
    private function _mergeStoredFields()
    {
        $this->_docCount = 0;

        foreach ($this->_segmentInfos as $segmentInfo) {
            $fdtFile     = $segmentInfo->openCompoundFile('.fdt');
            $segmentDocs = $segmentInfo->count();

            for ($docNum = 0; $docNum < $segmentDocs; $docNum++) {
                // The record is read even for deleted documents: the file is
                // consumed sequentially, so the data has to be skipped over.
                $fieldCount   = $fdtFile->readVInt();
                $storedFields = array();

                for ($fieldIdx = 0; $fieldIdx < $fieldCount; $fieldIdx++) {
                    $fieldNum  = $fdtFile->readVInt();
                    $bits      = $fdtFile->readByte();
                    $fieldInfo = $segmentInfo->getField($fieldNum);

                    // Bit 0x01 - tokenized flag, bit 0x02 - binary flag
                    if (!($bits & 2)) { // Text data
                        $storedFields[] =
                                 new Zend_Search_Lucene_Field($fieldInfo->name,
                                                              $fdtFile->readString(),
                                                              'UTF-8',
                                                              true,
                                                              $fieldInfo->isIndexed,
                                                              $bits & 1 );
                    } else {            // Binary data
                        $storedFields[] =
                                 new Zend_Search_Lucene_Field($fieldInfo->name,
                                                              $fdtFile->readBinary(),
                                                              '',
                                                              true,
                                                              $fieldInfo->isIndexed,
                                                              $bits & 1,
                                                              true);
                    }
                }

                if (!$segmentInfo->isDeleted($docNum)) {
                    $this->_docCount++;
                    $this->_writer->addStoredFields($storedFields);
                }
            }
        }
    }


    /**
     * Merge term dictionaries.
     *
     * Source segments are kept in a priority queue. Segments popped one after
     * another with the same current term key have their positions combined
     * into a single entry, which is written once the next term differs.
     */
    private function _mergeTerms()
    {
        $segmentInfoQueue = new Zend_Search_Lucene_Index_SegmentInfoPriorityQueue();

        $segmentStartId = 0;
        foreach ($this->_segmentInfos as $segmentInfo) {
            $segmentStartId = $segmentInfo->reset($segmentStartId, Zend_Search_Lucene_Index_SegmentInfo::SM_MERGE_INFO);

            // Skip "empty" segments
            if ($segmentInfo->currentTerm() !== null) {
                $segmentInfoQueue->put($segmentInfo);
            }
        }

        $this->_writer->initializeDictionaryFiles();

        $termDocs = array();
        while (($segmentInfo = $segmentInfoQueue->pop()) !== null) {
            // Merge positions array
            $termDocs += $segmentInfo->currentTermPositions();

            $nextSegment = $segmentInfoQueue->top();
            if ($nextSegment === null ||
                $nextSegment->currentTerm()->key() !== $segmentInfo->currentTerm()->key()) {
                // The next queued term differs (or nothing is queued):
                // the collected entry is complete
                ksort($termDocs, SORT_NUMERIC);

                // Add term if it's contained in any document
                if (count($termDocs) > 0) {
                    $this->_writer->addTerm($segmentInfo->currentTerm(), $termDocs);
                }
                $termDocs = array();
            }

            $segmentInfo->nextTerm();
            // Check if the segment dictionary is finished
            if ($segmentInfo->currentTerm() !== null) {
                // Put segment back into the priority queue
                $segmentInfoQueue->put($segmentInfo);
            }
        }

        $this->_writer->closeDictionaryFiles();
    }
}
PK =hH[3R�G TermInfo.phpnu &1i� <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
 * A Zend_Search_Lucene_Index_TermInfo is the record of information stored
 * for a single term.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Index_TermInfo
{
    /**
     * The number of documents which contain the term.
     *
     * @var integer
     */
    public $docFreq;

    /**
     * Data offset in a Frequencies file.
     *
     * @var integer
     */
    public $freqPointer;

    /**
     * Data offset in a Positions file.
     *
     * @var integer
     */
    public $proxPointer;

    /**
     * Skip data offset in a Frequencies file.
     *
     * @var integer
     */
    public $skipOffset;

    /**
     * Term offset of the _next_ term in a TermDictionary file.
     * Used only for Term Index
     *
     * @var integer
     */
    public $indexPointer;


    /**
     * Object constructor. Stores the given values unchanged.
     *
     * @param integer $docFreq
     * @param integer $freqPointer
     * @param integer $proxPointer
     * @param integer $skipOffset
     * @param integer|null $indexPointer  only meaningful for Term Index entries
     */
    public function __construct($docFreq, $freqPointer, $proxPointer, $skipOffset, $indexPointer = null)
    {
        $this->docFreq      = $docFreq;
        $this->freqPointer  = $freqPointer;
        $this->proxPointer  = $proxPointer;
        $this->skipOffset   = $skipOffset;
        $this->indexPointer = $indexPointer;
    }
}
PK =hH[j��4O 4O SegmentWriter.phpnu &1i� <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Exception */
require_once 'Zend/Search/Lucene/Exception.php';
/** Zend_Search_Lucene_Index_SegmentInfo */
require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
/**
 * Base class of segment writers.
 *
 * Keeps the field list, norms and stored fields of one segment and provides
 * the routines which dump the segment files ('.fnm', '.nrm', '.fdx', '.fdt',
 * '.tis', '.tii', '.frq', '.prx') and pack them into a compound '.cfs' file.
 * Concrete writers have to implement close().
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
abstract class Zend_Search_Lucene_Index_SegmentWriter
{
    /**
     * Expert: The fraction of terms in the "dictionary" which should be stored
     * in RAM.  Smaller values use more memory, but make searching slightly
     * faster, while larger values use less memory and make searching slightly
     * slower.  Searching is typically not dominated by dictionary lookup, so
     * tweaking this is rarely useful.
     *
     * @var integer
     */
    public static $indexInterval = 128;

    /**
     * Expert: The fraction of TermDocs entries stored in skip tables.
     * Larger values result in smaller indexes, greater acceleration, but fewer
     * accelerable cases, while smaller values result in bigger indexes,
     * less acceleration and more accelerable cases.
     * More detailed experiments would be useful here.
     *
     * 0x7FFFFFFF indicates that we don't use skip data
     *
     * Note: not used in current implementation
     *
     * @var integer
     */
    public static $skipInterval = 0x7FFFFFFF;

    /**
     * Expert: The maximum number of skip levels. Smaller values result in
     * slightly smaller indexes, but slower skipping in big posting lists.
     *
     * 0 indicates that we don't use skip data
     *
     * Note: not used in current implementation
     *
     * @var integer
     */
    public static $maxSkipLevels = 0;

    /**
     * Number of docs in a segment
     *
     * @var integer
     */
    protected $_docCount = 0;

    /**
     * Segment name
     *
     * @var string
     */
    protected $_name;

    /**
     * File system adapter.
     *
     * @var Zend_Search_Lucene_Storage_Directory
     */
    protected $_directory;

    /**
     * List of the index file names.
     * Used for automatic compound file generation
     *
     * @var array
     */
    protected $_files = array();

    /**
     * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects
     * for this segment, keyed by field name
     *
     * @var array
     */
    protected $_fields = array();

    /**
     * Normalization factors.
     * An array fieldName => normVector
     * normVector is a binary string.
     * Each byte corresponds to an indexed document in a segment and
     * encodes normalization factor (float value, encoded by
     * Zend_Search_Lucene_Search_Similarity::encodeNorm())
     *
     * @var array
     */
    protected $_norms = array();

    /**
     * '.fdx'  file - Stored Fields, the field index.
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    protected $_fdxFile = null;

    /**
     * '.fdt'  file - Stored Fields, the field data.
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    protected $_fdtFile = null;


    /**
     * Object constructor.
     *
     * @param Zend_Search_Lucene_Storage_Directory $directory
     * @param string $name
     */
    public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
    {
        $this->_directory = $directory;
        $this->_name      = $name;
    }


    /**
     * Add field to the segment
     *
     * Returns actual field number
     *
     * @param Zend_Search_Lucene_Field $field
     * @return integer
     */
    public function addField(Zend_Search_Lucene_Field $field)
    {
        return $this->_registerField($field->name, $field->isIndexed, $field->storeTermVector);
    }

    /**
     * Add fieldInfo to the segment
     *
     * Returns actual field number. The number carried by $fieldInfo itself
     * is ignored: the segment assigns its own numbers.
     *
     * @param Zend_Search_Lucene_Index_FieldInfo $fieldInfo
     * @return integer
     */
    public function addFieldInfo(Zend_Search_Lucene_Index_FieldInfo $fieldInfo)
    {
        return $this->_registerField($fieldInfo->name, $fieldInfo->isIndexed, $fieldInfo->storeTermVector);
    }

    /**
     * Register a field by name or merge the flags into an already known field.
     *
     * New fields get the next free number (the current field count). For a
     * known field the 'indexed' and 'store term vector' flags are OR-ed
     * logically, so they stay booleans instead of being turned into integers.
     *
     * @param string  $name
     * @param boolean $isIndexed
     * @param boolean $storeTermVector
     * @return integer  field number within this segment
     */
    private function _registerField($name, $isIndexed, $storeTermVector)
    {
        if (!isset($this->_fields[$name])) {
            $fieldNumber = count($this->_fields);
            $this->_fields[$name] =
                                new Zend_Search_Lucene_Index_FieldInfo($name,
                                                                       $isIndexed,
                                                                       $fieldNumber,
                                                                       $storeTermVector);

            return $fieldNumber;
        }

        $knownField = $this->_fields[$name];
        $knownField->isIndexed       = ($knownField->isIndexed       || $isIndexed);
        $knownField->storeTermVector = ($knownField->storeTermVector || $storeTermVector);

        return $knownField->number;
    }

    /**
     * Returns array of FieldInfo objects.
     *
     * @return array
     */
    public function getFieldInfos()
    {
        return $this->_fields;
    }

    /**
     * Add stored fields information
     *
     * Appends one document record to the '.fdt' file and its start offset to
     * the '.fdx' file; both files are created on the first call.
     *
     * @param array $storedFields array of Zend_Search_Lucene_Field objects
     */
    public function addStoredFields($storedFields)
    {
        if (!isset($this->_fdxFile)) {
            $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx');
            $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt');

            $this->_files[] = $this->_name . '.fdx';
            $this->_files[] = $this->_name . '.fdt';
        }

        $this->_fdxFile->writeLong($this->_fdtFile->tell());
        $this->_fdtFile->writeVInt(count($storedFields));

        foreach ($storedFields as $field) {
            $this->_fdtFile->writeVInt($this->_fields[$field->name]->number);

            // 0x01 - tokenized, 0x02 - binary; 0x04 (compressed, ZLIB) is never set
            $fieldBits = ($field->isTokenized ? 0x01 : 0x00) |
                         ($field->isBinary    ? 0x02 : 0x00);
            $this->_fdtFile->writeByte($fieldBits);

            if ($field->isBinary) {
                $this->_fdtFile->writeVInt(strlen($field->value));
                $this->_fdtFile->writeBytes($field->value);
            } else {
                $this->_fdtFile->writeString($field->getUtf8Value());
            }
        }

        $this->_docCount++;
    }

    /**
     * Returns the total number of documents in this segment.
     *
     * @return integer
     */
    public function count()
    {
        return $this->_docCount;
    }

    /**
     * Return segment name
     *
     * @return string
     */
    public function getName()
    {
        return $this->_name;
    }

    /**
     * Dump Field Info (.fnm) segment file together with the norms (.nrm) file
     */
    protected function _dumpFNM()
    {
        $fnmFile = $this->_directory->createFile($this->_name . '.fnm');
        $fnmFile->writeVInt(count($this->_fields));

        $nrmFile = $this->_directory->createFile($this->_name . '.nrm');
        // Write header
        $nrmFile->writeBytes('NRM');
        // Write format specifier
        $nrmFile->writeByte((int)0xFF);

        foreach ($this->_fields as $field) {
            $fnmFile->writeString($field->name);
            // 0x04 (term positions stored with the term vectors) and
            // 0x08 (term offsets stored with the term vectors) are not supported yet
            $fnmFile->writeByte(($field->isIndexed       ? 0x01 : 0x00) |
                                ($field->storeTermVector ? 0x02 : 0x00));

            if ($field->isIndexed) {
                // Norms of all indexed fields go into the single '.nrm' file
                // (the pre-2.1 index mode with one '.f<number>' file per field is not used now)
                $nrmFile->writeBytes($this->_norms[$field->name]);
            }
        }

        $this->_files[] = $this->_name . '.fnm';
        $this->_files[] = $this->_name . '.nrm';
    }


    /**
     * Term Dictionary file
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_tisFile = null;

    /**
     * Term Dictionary index file
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_tiiFile = null;

    /**
     * Frequencies file
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_frqFile = null;

    /**
     * Positions file
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_prxFile = null;

    /**
     * Number of written terms
     *
     * @var integer
     */
    private $_termCount;


    /**
     * Last saved term
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_prevTerm;

    /**
     * Last saved term info
     *
     * @var Zend_Search_Lucene_Index_TermInfo
     */
    private $_prevTermInfo;

    /**
     * Last saved index term
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_prevIndexTerm;

    /**
     * Last saved index term info
     *
     * @var Zend_Search_Lucene_Index_TermInfo
     */
    private $_prevIndexTermInfo;

    /**
     * Last term dictionary file position
     *
     * @var integer
     */
    private $_lastIndexPosition;

    /**
     * Create dictionary, frequency and positions files and write necessary headers
     */
    public function initializeDictionaryFiles()
    {
        $this->_tisFile = $this->_directory->createFile($this->_name . '.tis');
        $this->_writeDictionaryHeader($this->_tisFile);

        $this->_tiiFile = $this->_directory->createFile($this->_name . '.tii');
        $this->_writeDictionaryHeader($this->_tiiFile);

        /** Dump the special first entry of the dictionary index */
        $this->_tiiFile->writeVInt(0);                    // prefix length
        $this->_tiiFile->writeString('');                 // suffix
        $this->_tiiFile->writeInt((int)0xFFFFFFFF);       // field number
        $this->_tiiFile->writeByte((int)0x0F);
        $this->_tiiFile->writeVInt(0);                    // DocFreq
        $this->_tiiFile->writeVInt(0);                    // FreqDelta
        $this->_tiiFile->writeVInt(0);                    // ProxDelta
        $this->_tiiFile->writeVInt(24);                   // IndexDelta

        $this->_frqFile = $this->_directory->createFile($this->_name . '.frq');
        $this->_prxFile = $this->_directory->createFile($this->_name . '.prx');

        $this->_files[] = $this->_name . '.tis';
        $this->_files[] = $this->_name . '.tii';
        $this->_files[] = $this->_name . '.frq';
        $this->_files[] = $this->_name . '.prx';

        $this->_prevTerm          = null;
        $this->_prevTermInfo      = null;
        $this->_prevIndexTerm     = null;
        $this->_prevIndexTermInfo = null;
        // Same value as the IndexDelta of the special entry above; presumably
        // the size of the '.tis' header (four ints and one long) - TODO confirm
        $this->_lastIndexPosition = 24;
        $this->_termCount         = 0;
    }

    /**
     * Write the header shared by the '.tis' and '.tii' files.
     *
     * The terms count is written as a placeholder and gets overwritten
     * by closeDictionaryFiles().
     *
     * @param Zend_Search_Lucene_Storage_File $dicFile
     */
    private function _writeDictionaryHeader($dicFile)
    {
        $dicFile->writeInt((int)0xFFFFFFFD);
        $dicFile->writeLong(0 /* dummy data for terms count */);
        $dicFile->writeInt(self::$indexInterval);
        $dicFile->writeInt(self::$skipInterval);
        $dicFile->writeInt(self::$maxSkipLevels);
    }

    /**
     * Add term
     *
     * Term positions is an array( docId => array(pos1, pos2, pos3, ...), ... )
     *
     * Writes the frequency and position data of the term and its dictionary
     * entry; every self::$indexInterval-th term is also written into the
     * dictionary index.
     *
     * @param Zend_Search_Lucene_Index_Term $termEntry  term with a field name in its 'field' property
     * @param array $termDocs
     */
    public function addTerm($termEntry, $termDocs)
    {
        $freqPointer = $this->_frqFile->tell();
        $proxPointer = $this->_prxFile->tell();
        $docFreq     = count($termDocs);

        $prevDoc = 0;
        foreach ($termDocs as $docId => $termPositions) {
            // The lowest bit of the doc delta is set when the term occurs
            // exactly once; otherwise the frequency follows explicitly
            $docDelta = ($docId - $prevDoc)*2;
            $prevDoc  = $docId;

            $positionCount = count($termPositions);
            if ($positionCount > 1) {
                $this->_frqFile->writeVInt($docDelta);
                $this->_frqFile->writeVInt($positionCount);
            } else {
                $this->_frqFile->writeVInt($docDelta + 1);
            }

            $prevPosition = 0;
            foreach ($termPositions as $position) {
                $this->_prxFile->writeVInt($position - $prevPosition);
                $prevPosition = $position;
            }
        }

        if ($docFreq >= self::$skipInterval) {
            /**
             * @todo Write Skip Data to a freq file.
             * It's not used now, but makes the index more optimal
             */
            $skipOffset = $this->_frqFile->tell() - $freqPointer;
        } else {
            $skipOffset = 0;
        }

        // Dictionary entries refer to fields by number, not by name
        $term = new Zend_Search_Lucene_Index_Term($termEntry->text,
                                                  $this->_fields[$termEntry->field]->number);
        $termInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq,
                                                          $freqPointer, $proxPointer, $skipOffset);

        $this->_dumpTermDictEntry($this->_tisFile, $this->_prevTerm, $term, $this->_prevTermInfo, $termInfo);

        if (($this->_termCount + 1) % self::$indexInterval == 0) {
            $this->_dumpTermDictEntry($this->_tiiFile, $this->_prevIndexTerm, $term, $this->_prevIndexTermInfo, $termInfo);

            $indexPosition = $this->_tisFile->tell();
            $this->_tiiFile->writeVInt($indexPosition - $this->_lastIndexPosition);
            $this->_lastIndexPosition = $indexPosition;
        }
        $this->_termCount++;
    }

    /**
     * Close dictionary
     *
     * Replaces the placeholder term counts in the '.tis' and '.tii' headers
     * with the actual values.
     */
    public function closeDictionaryFiles()
    {
        $this->_tisFile->seek(4);
        $this->_tisFile->writeLong($this->_termCount);

        $this->_tiiFile->seek(4);
        // + 1 is used to count an additional special index entry (empty term at the start of the list)
        $this->_tiiFile->writeLong(($this->_termCount - $this->_termCount % self::$indexInterval)/self::$indexInterval + 1);
    }


    /**
     * Dump Term Dictionary segment file entry.
     * Used to write entry to .tis or .tii files
     *
     * $prevTerm and $prevTermInfo are taken by reference and are replaced by
     * $term and $termInfo, so the next call encodes its entry relative to
     * this one.
     *
     * @param Zend_Search_Lucene_Storage_File $dicFile
     * @param Zend_Search_Lucene_Index_Term $prevTerm
     * @param Zend_Search_Lucene_Index_Term $term
     * @param Zend_Search_Lucene_Index_TermInfo $prevTermInfo
     * @param Zend_Search_Lucene_Index_TermInfo $termInfo
     */
    protected function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile,
                                        &$prevTerm,     Zend_Search_Lucene_Index_Term     $term,
                                        &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo)
    {
        if (isset($prevTerm) && $prevTerm->field == $term->field) {
            // Count the leading bytes shared with the previous term
            $matchedBytes = 0;
            $maxBytes = min(strlen($prevTerm->text), strlen($term->text));
            while ($matchedBytes < $maxBytes  &&
                   $prevTerm->text[$matchedBytes] == $term->text[$matchedBytes]) {
                $matchedBytes++;
            }

            // Calculate actual matched UTF-8 pattern:
            // only whole characters may belong to the shared prefix
            $prefixBytes = 0;
            $prefixChars = 0;
            while ($prefixBytes < $matchedBytes) {
                $leadByte  = ord($term->text[$prefixBytes]);
                $charBytes = 1;
                if (($leadByte & 0xC0) == 0xC0) {
                    $charBytes++;
                    if ($leadByte & 0x20) {
                        $charBytes++;
                        if ($leadByte & 0x10) {
                            $charBytes++;
                        }
                    }
                }

                if ($prefixBytes + $charBytes > $matchedBytes) {
                    // char crosses matched bytes boundary
                    // skip char
                    break;
                }

                $prefixChars++;
                $prefixBytes += $charBytes;
            }

            // Write prefix length
            $dicFile->writeVInt($prefixChars);
            // Write suffix
            $dicFile->writeString(substr($term->text, $prefixBytes));
        } else {
            // Write prefix length
            $dicFile->writeVInt(0);
            // Write suffix
            $dicFile->writeString($term->text);
        }
        // Write field number
        $dicFile->writeVInt($term->field);
        // DocFreq (the count of documents which contain the term)
        $dicFile->writeVInt($termInfo->docFreq);

        $prevTerm = $term;

        if (!isset($prevTermInfo)) {
            // First entry: pointers are written as is
            // Write FreqDelta
            $dicFile->writeVInt($termInfo->freqPointer);
            // Write ProxDelta
            $dicFile->writeVInt($termInfo->proxPointer);
        } else {
            // Write FreqDelta
            $dicFile->writeVInt($termInfo->freqPointer - $prevTermInfo->freqPointer);
            // Write ProxDelta
            $dicFile->writeVInt($termInfo->proxPointer - $prevTermInfo->proxPointer);
        }
        // Write SkipOffset - addTerm() makes it non-zero only when docFreq >= self::$skipInterval
        if ($termInfo->skipOffset != 0) {
            $dicFile->writeVInt($termInfo->skipOffset);
        }

        $prevTermInfo = $termInfo;
    }


    /**
     * Generate compound index file
     *
     * Packs all files listed in $_files into '<name>.cfs' and deletes the
     * originals from the directory.
     *
     * @throws Zend_Search_Lucene_Exception  if a file yields no data while
     *                                       bytes are still expected
     */
    protected function _generateCFS()
    {
        $cfsFile = $this->_directory->createFile($this->_name . '.cfs');
        $cfsFile->writeVInt(count($this->_files));

        // File table: offsets are not known yet, remember where to patch them
        $dataOffsetPointers = array();
        foreach ($this->_files as $fileName) {
            $dataOffsetPointers[$fileName] = $cfsFile->tell();
            $cfsFile->writeLong(0); // write dummy data
            $cfsFile->writeString($fileName);
        }

        foreach ($this->_files as $fileName) {
            // Get actual data offset
            $dataOffset = $cfsFile->tell();
            // Seek to the data offset pointer
            $cfsFile->seek($dataOffsetPointers[$fileName]);
            // Write actual data offset value
            $cfsFile->writeLong($dataOffset);
            // Seek back to the end of file
            $cfsFile->seek($dataOffset);

            $dataFile = $this->_directory->getFileObject($fileName);

            $byteCount = $this->_directory->fileLength($fileName);
            while ($byteCount > 0) {
                $data       = $dataFile->readBytes(min($byteCount, 131072 /*128Kb*/));
                $dataLength = strlen($data);
                if ($dataLength == 0) {
                    // An empty read would never decrease $byteCount:
                    // fail instead of looping forever
                    throw new Zend_Search_Lucene_Exception('Unexpected end of file \'' . $fileName
                                                         . '\' while generating compound file.');
                }
                $byteCount -= $dataLength;
                $cfsFile->writeBytes($data);
            }

            $this->_directory->deleteFile($fileName);
        }
    }


    /**
     * Close segment, write it to disk and return segment info
     *
     * @return Zend_Search_Lucene_Index_SegmentInfo
     */
    abstract public function close();
}
PK =hH[�i��
FieldInfo.phpnu &1i� <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
 * Plain record describing one field of an index segment.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Index_FieldInfo
{
    /**
     * Field name
     *
     * @var string
     */
    public $name;

    /**
     * Flag: the field is indexed
     *
     * @var boolean
     */
    public $isIndexed;

    /**
     * Field number
     *
     * @var integer
     */
    public $number;

    /**
     * Flag: term vector is stored for the field
     *
     * @var boolean
     */
    public $storeTermVector;

    /**
     * Flag: norms are omitted for the field
     *
     * @var boolean
     */
    public $normsOmitted;

    /**
     * Flag: payloads are stored for the field
     *
     * @var boolean
     */
    public $payloadsStored;


    /**
     * Object constructor. Stores the given values unchanged.
     *
     * @param string  $name
     * @param boolean $isIndexed
     * @param integer $number
     * @param boolean $storeTermVector
     * @param boolean $normsOmitted
     * @param boolean $payloadsStored
     */
    public function __construct($name, $isIndexed, $number, $storeTermVector, $normsOmitted = false, $payloadsStored = false)
    {
        $this->name            = $name;
        $this->isIndexed       = $isIndexed;
        $this->number          = $number;
        $this->storeTermVector = $storeTermVector;
        $this->normsOmitted    = $normsOmitted;
        $this->payloadsStored  = $payloadsStored;
    }
}
PK =hH[��T T Term.phpnu &1i� PK =hH[χ�m� � � SegmentInfo.phpnu &1i� PK =hH[(t�%F F � SegmentInfoPriorityQueue.phpnu &1i� PK =hH[�>;�� � ' SegmentWriter/StreamWriter.phpnu &1i� PK =hH[#$ �� � [3 SegmentWriter/DocumentWriter.phpnu &1i� PK =hH[���cX� X�
�R Writer.phpnu &1i� PK =hH[!��( ( � DictionaryLoader.phpnu &1i� PK =hH[��� � i DocsFilter.phpnu &1i� PK =hH[堵 # # : SegmentMerger.phpnu &1i� PK =hH[3R�G �1 TermInfo.phpnu &1i� PK =hH[j��4O 4O �9 SegmentWriter.phpnu &1i� PK =hH[�i��
8� FieldInfo.phpnu &1i� PK � ��