/** Zend_Search_Lucene_Index_FieldInfo */
require_once 'Zend/Search/Lucene/Index/FieldInfo.php';

/** Zend_Search_Lucene_Index_Term */
require_once 'Zend/Search/Lucene/Index/Term.php';

/** Zend_Search_Lucene_Index_TermInfo */
require_once 'Zend/Search/Lucene/Index/TermInfo.php';

abstract class Zend_Search_Lucene_Index_SegmentWriter
    /**
     * Expert: The fraction of terms in the "dictionary" which should be stored
     * in RAM.  Smaller values use more memory, but make searching slightly
     * faster, while larger values use less memory and make searching slightly
     * slower.  Searching is typically not dominated by dictionary lookup, so
     * tweaking this is rarely useful.
     *
     * @var integer
     */
    public static $indexInterval = 128;

    /**
     * Expert: The fraction of TermDocs entries stored in skip tables.
     * Larger values result in smaller indexes, greater acceleration, but fewer
     * accelerable cases, while smaller values result in bigger indexes,
     * less acceleration and more accelerable cases. More detailed experiments
     * would be useful here.
     *
     * 0x7FFFFFFF indicates that we don't use skip data
     *
     * Note: not used in current implementation
     *
     * @var integer
     */
    public static $skipInterval = 0x7FFFFFFF;

    /**
     * Expert: The maximum number of skip levels. Smaller values result in
     * slightly smaller indexes, but slower skipping in big posting lists.
     *
     * 0 indicates that we don't use skip data
     *
     * Note: not used in current implementation
     *
     * @var integer
     */
    public static $maxSkipLevels = 0;

    /**
     * Number of docs in a segment
     *
     * @var integer
     */
    protected $_docCount = 0;

    /**
     * Segment name
     *
     * @var string
     */
    protected $_name;

    /**
     * File system adapter.
     *
     * @var Zend_Search_Lucene_Storage_Directory
     */
    protected $_directory;

    /**
     * List of the index files (file names, appended as each file is created).
     * Used for automatic compound file generation
     *
     * @var array
     */
    protected $_files = array();

    /**
     * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment
     *
     * @var array
     */
    protected $_fields = array();

    /**
     * Normalization factors.
     * An array fieldName => normVector
     * normVector is a binary string.
     * Each byte corresponds to an indexed document in a segment and
     * encodes normalization factor (float value, encoded by
     * Zend_Search_Lucene_Search_Similarity::encodeNorm())
     *
     * @var array
     */
    protected $_norms = array();

    /**
     * '.fdx'  file - Stored Fields, the field index.
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    protected $_fdxFile = null;

    /**
     * '.fdt'  file - Stored Fields, the field data.
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    protected $_fdtFile = null;

    /**
     * Object constructor.
     *
     * @param Zend_Search_Lucene_Storage_Directory $directory
     * @param string $name
     */
    public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
    {
        // $directory is where every segment file is created; $name is the
        // common prefix of those files ($name . '.fdx', $name . '.tis', ...).
        $this->_directory = $directory;
        $this->_name      = $name;
    }

    /**
     * Add field to the segment
     *
     * Returns actual field number
     *
     * @param Zend_Search_Lucene_Field $field
     * @return integer
     */
    public function addField(Zend_Search_Lucene_Field $field)
        // Registers $field in $this->_fields (keyed by field name) and returns
        // the number assigned to that field name.
        // NOTE(review): the braces delimiting this method body are not present
        // in this copy of the file - confirm against the upstream source.
        if (!isset($this->_fields[$field->name])) {
            // First time this name is seen: the current count of known fields
            // is used as the new field number.
            $fieldNumber = count($this->_fields);
            $this->_fields[$field->name] =
                                new Zend_Search_Lucene_Index_FieldInfo($field->name,
            // NOTE(review): the constructor call above is cut off after its
            // first argument; the remaining arguments and the closing ");" are
            // missing here - restore from upstream before use.

            return $fieldNumber;
        } else {
            // Name already registered: OR the incoming flags into the stored
            // FieldInfo, so a flag stays set once any added field sets it.
            $this->_fields[$field->name]->isIndexed       |= $field->isIndexed;
            $this->_fields[$field->name]->storeTermVector |= $field->storeTermVector;

            // The number assigned at first registration is reused.
            return $this->_fields[$field->name]->number;

    /**
     * Add fieldInfo to the segment
     *
     * Returns actual field number
     *
     * @param Zend_Search_Lucene_Index_FieldInfo $fieldInfo
     * @return integer
     */
    public function addFieldInfo(Zend_Search_Lucene_Index_FieldInfo $fieldInfo)
        // Same registration logic as addField(), but driven by an existing
        // FieldInfo object: a new entry is created under $fieldInfo->name, or
        // the flags are merged into the entry already stored under that name.
        // NOTE(review): the braces delimiting this method body are not present
        // in this copy of the file - confirm against the upstream source.
        if (!isset($this->_fields[$fieldInfo->name])) {
            // Unknown name: number it with the current count of known fields.
            $fieldNumber = count($this->_fields);
            $this->_fields[$fieldInfo->name] =
                                new Zend_Search_Lucene_Index_FieldInfo($fieldInfo->name,
            // NOTE(review): the constructor call above is cut off after its
            // first argument; the remaining arguments and the closing ");" are
            // missing here - restore from upstream before use.

            return $fieldNumber;
        } else {
            // Known name: OR the incoming flags into the stored FieldInfo.
            $this->_fields[$fieldInfo->name]->isIndexed       |= $fieldInfo->isIndexed;
            $this->_fields[$fieldInfo->name]->storeTermVector |= $fieldInfo->storeTermVector;

            // The number of the stored entry is returned, not $fieldInfo->number.
            return $this->_fields[$fieldInfo->name]->number;

    /**
     * Returns array of FieldInfo objects.
     *
     * @return array
     */
    public function getFieldInfos()
    {
        // The registry filled by addField()/addFieldInfo(), keyed by field name.
        return $this->_fields;
    }

    /**
     * Add stored fields information
     *
     * @param array $storedFields array of Zend_Search_Lucene_Field objects
     */
    public function addStoredFields($storedFields)
        // Lazily creates the '.fdx' / '.fdt' stored-field files on first use,
        // then iterates over the given fields.
        // NOTE(review): braces closing the blocks below and the statements that
        // actually write to the two files are not present in this copy of the
        // file - confirm against the upstream source.
        if (!isset($this->_fdxFile)) {
            $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx');
            $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt');

            // Track both files so they can be picked up by compound file generation.
            $this->_files[] = $this->_name . '.fdx';
            $this->_files[] = $this->_name . '.fdt';

        foreach ($storedFields as $field) {
            // Per-field flags: 0x01 = tokenized, 0x02 = binary; the compression
            // bit (0x04) is always left clear.
            // NOTE(review): $fieldBits is computed but never used in the visible code.
            $fieldBits = ($field->isTokenized ? 0x01 : 0x00) |
                         ($field->isBinary ?    0x02 : 0x00) |
                         0x00; /* 0x04 - third bit, compressed (ZLIB) */
            // NOTE(review): both branches below are empty here - presumably the
            // binary and text value writes were lost; confirm.
            if ($field->isBinary) {
            } else {


    /**
     * Returns the total number of documents in this segment.
     *
     * @return integer
     */
    public function count()
    {
        // Current value of the per-segment document counter.
        return $this->_docCount;
    }

    /**
     * Return segment name
     *
     * @return string
     */
    public function getName()
    {
        // The name given to the constructor; also the prefix of every segment file.
        return $this->_name;
    }

    /**
     * Dump Field Info (.fnm) segment file
     */
    protected function _dumpFNM()
        // Creates the segment's '.fnm' and '.nrm' files, writes one flags byte
        // per known field to the '.fnm' file and registers both file names in
        // $this->_files.
        // NOTE(review): the method's braces and several statements appear to be
        // missing from this copy of the file; the notes below mark the gaps.
        $fnmFile = $this->_directory->createFile($this->_name . '.fnm');

        $nrmFile = $this->_directory->createFile($this->_name . '.nrm');
        // Write header
        // Write format specifier
        // NOTE(review): no statements follow the two comments above, and nothing
        // visible writes to $nrmFile - confirm against upstream.

        foreach ($this->_fields as $field) {
            // Flags byte: bit 0x01 = field is indexed, bit 0x02 = term vectors stored.
            $fnmFile->writeByte(($field->isIndexed       ? 0x01 : 0x00) |
                                ($field->storeTermVector ? 0x02 : 0x00)
// not supported yet            0x04 /* term positions are stored with the term vectors */ |
// not supported yet            0x08 /* term offsets are stored with the term vectors */   |
            // NOTE(review): the closing ");" of the writeByte() call is missing here.

            if ($field->isIndexed) {
                // pre-2.1 index mode (not used now)
                // $normFileName = $this->_name . '.f' . $field->number;
                // $fFile = $this->_directory->createFile($normFileName);
                // $fFile->writeBytes($this->_norms[$field->name]);
                // $this->_files[] = $normFileName;
                // NOTE(review): only the disabled pre-2.1 code is visible in this
                // branch; presumably the norms are written to $nrmFile here - confirm.


        // Track both files so they can be picked up by compound file generation.
        $this->_files[] = $this->_name . '.fnm';
        $this->_files[] = $this->_name . '.nrm';

    /**
     * Term Dictionary file
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_tisFile = null;

    /**
     * Term Dictionary index file
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_tiiFile = null;

    /**
     * Frequencies file
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_frqFile = null;

    /**
     * Positions file
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_prxFile = null;

    /**
     * Number of written terms
     *
     * @var integer
     */
    private $_termCount;

    /**
     * Last saved term
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_prevTerm;

    /**
     * Last saved term info
     *
     * @var Zend_Search_Lucene_Index_TermInfo
     */
    private $_prevTermInfo;

    /**
     * Last saved index term
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_prevIndexTerm;

    /**
     * Last saved index term info
     *
     * @var Zend_Search_Lucene_Index_TermInfo
     */
    private $_prevIndexTermInfo;

    /**
     * Last term dictionary file position
     *
     * @var integer
     */
    private $_lastIndexPosition;

    /**
     * Create dictionary, frequency and positions files and write necessary headers
     */
    public function initializeDictionaryFiles()
        // Creates the '.tis', '.tii', '.frq' and '.prx' files, writes the
        // placeholder/header data shown below, registers the four file names in
        // $this->_files and resets the dictionary-writing state.
        // NOTE(review): the method's braces are not present in this copy of the
        // file, and further header writes may be missing (see the note on the
        // value 24 below) - confirm against the upstream source.
        $this->_tisFile = $this->_directory->createFile($this->_name . '.tis');
        $this->_tisFile->writeLong(0 /* dummy data for terms count */);

        $this->_tiiFile = $this->_directory->createFile($this->_name . '.tii');
        $this->_tiiFile->writeLong(0 /* dummy data for terms count */);

        /** Dump dictionary header */
        // This is the special empty first index entry that closeDictionaryFiles()
        // accounts for with its "+ 1".
        $this->_tiiFile->writeVInt(0);                    // prefix length
        $this->_tiiFile->writeString('');                 // suffix
        $this->_tiiFile->writeInt((int)0xFFFFFFFF);       // field number
        $this->_tiiFile->writeVInt(0);                    // DocFreq
        $this->_tiiFile->writeVInt(0);                    // FreqDelta
        $this->_tiiFile->writeVInt(0);                    // ProxDelta
        // NOTE(review): 24 matches the initial $_lastIndexPosition set below and
        // presumably equals the size of the '.tis' header, yet the only visible
        // write to '.tis' is the single writeLong() above - confirm.
        $this->_tiiFile->writeVInt(24);                   // IndexDelta

        $this->_frqFile = $this->_directory->createFile($this->_name . '.frq');
        $this->_prxFile = $this->_directory->createFile($this->_name . '.prx');

        // Track all four files so they can be picked up by compound file generation.
        $this->_files[] = $this->_name . '.tis';
        $this->_files[] = $this->_name . '.tii';
        $this->_files[] = $this->_name . '.frq';
        $this->_files[] = $this->_name . '.prx';

        // No term written yet: the first entries are encoded without a predecessor.
        $this->_prevTerm          = null;
        $this->_prevTermInfo      = null;
        $this->_prevIndexTerm     = null;
        $this->_prevIndexTermInfo = null;
        $this->_lastIndexPosition = 24;
        $this->_termCount         = 0;


    /**
     * Add term
     *
     * $termDocs maps each document id to the term's positions in that document:
     * array( docId => array(pos1, pos2, pos3, ...), ... )
     *
     * @param Zend_Search_Lucene_Index_Term $termEntry
     * @param array $termDocs
     */
    public function addTerm($termEntry, $termDocs)
        // Writes one term's postings to the '.frq' / '.prx' files, then records
        // the term in the '.tis' dictionary and, when ($_termCount + 1) is a
        // multiple of $indexInterval, in the '.tii' index as well.
        // NOTE(review): braces closing the blocks below and several statements
        // appear to be missing from this copy of the file; the notes mark the
        // visible gaps - confirm against the upstream source.

        // Start offsets of this term's data; both are passed to the TermInfo below.
        $freqPointer = $this->_frqFile->tell();
        $proxPointer = $this->_prxFile->tell();

        $prevDoc = 0;
        foreach ($termDocs as $docId => $termPositions) {
            // Document ids are stored as deltas from the previous id, doubled so
            // that the low bit is free; the single-position branch sets it (+ 1).
            $docDelta = ($docId - $prevDoc)*2;
            $prevDoc = $docId;
            // NOTE(review): the multi-position branch has no statements here -
            // presumably it writes $docDelta and the position count; confirm.
            if (count($termPositions) > 1) {
            } else {
                $this->_frqFile->writeVInt($docDelta + 1);

            // Positions are stored as deltas from the previous position in the document.
            $prevPosition = 0;
            foreach ($termPositions as $position) {
                $this->_prxFile->writeVInt($position - $prevPosition);
                $prevPosition = $position;

        if (count($termDocs) >= self::$skipInterval) {
            // NOTE(review): the next two lines read like a doc comment whose
            // delimiters were lost; as written they are not valid PHP.
             * @todo Write Skip Data to a freq file.
             * It's not used now, but make index more optimal
            // Number of bytes written to '.frq' for this term.
            $skipOffset = $this->_frqFile->tell() - $freqPointer;
        } else {
            $skipOffset = 0;

        // NOTE(review): the Term constructor call below is cut off after its
        // first argument - restore the remaining argument(s) from upstream.
        $term = new Zend_Search_Lucene_Index_Term($termEntry->text,
        $termInfo = new Zend_Search_Lucene_Index_TermInfo(count($termDocs),
                                                          $freqPointer, $proxPointer, $skipOffset);

        // $_prevTerm / $_prevTermInfo are taken by reference and updated by the call.
        $this->_dumpTermDictEntry($this->_tisFile, $this->_prevTerm, $term, $this->_prevTermInfo, $termInfo);

        if (($this->_termCount + 1) % self::$indexInterval == 0) {
            $this->_dumpTermDictEntry($this->_tiiFile, $this->_prevIndexTerm, $term, $this->_prevIndexTermInfo, $termInfo);

            // Index entries also carry the '.tis' position as a delta from the
            // position recorded for the previous index entry.
            $indexPosition = $this->_tisFile->tell();
            $this->_tiiFile->writeVInt($indexPosition - $this->_lastIndexPosition);
            $this->_lastIndexPosition = $indexPosition;
            // NOTE(review): no visible statement increments $this->_termCount in
            // this method - confirm against upstream.


    /**
     * Close dictionary
     */
    public function closeDictionaryFiles()
        // Writes the number of index entries to the '.tii' file.
        // NOTE(review): the method's braces are not present in this copy of the
        // file, and no seek precedes the write below even though a placeholder
        // count was written when the file was created - presumably the seeks and
        // the matching '.tis' count write were lost; confirm against upstream.

        // + 1 is used to count an additional special index entry (empty term at the start of the list)
        // The expression is floor($_termCount / $indexInterval) + 1, kept in integer arithmetic.
        $this->_tiiFile->writeLong(($this->_termCount - $this->_termCount % self::$indexInterval)/self::$indexInterval + 1);

    /**
     * Dump Term Dictionary segment file entry.
     * Used to write entry to .tis or .tii files
     *
     * @param Zend_Search_Lucene_Storage_File $dicFile
     * @param Zend_Search_Lucene_Index_Term $prevTerm
     * @param Zend_Search_Lucene_Index_Term $term
     * @param Zend_Search_Lucene_Index_TermInfo $prevTermInfo
     * @param Zend_Search_Lucene_Index_TermInfo $termInfo
     */
    protected function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile,
                                        &$prevTerm,     Zend_Search_Lucene_Index_Term     $term,
                                        &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo)
        // Writes one dictionary entry for $term to $dicFile, encoded relative to
        // the previous entry, then stores $term / $termInfo back into the
        // by-reference $prevTerm / $prevTermInfo for the next call.
        // NOTE(review): braces, loop bodies and most of the write statements
        // announced by the comments below are not present in this copy of the
        // file; the notes mark the visible gaps - confirm against upstream.
        if (isset($prevTerm) && $prevTerm->field == $term->field) {
            // Same field as the previous entry: count the leading bytes shared
            // by the two term texts.
            $matchedBytes = 0;
            $maxBytes = min(strlen($prevTerm->text), strlen($term->text));
            // NOTE(review): the body of this loop is missing (presumably
            // $matchedBytes++) - as shown it would never advance; confirm.
            while ($matchedBytes < $maxBytes  &&
                   $prevTerm->text[$matchedBytes] == $term->text[$matchedBytes]) {

            // Calculate actual matched UTF-8 pattern
            // The shared bytes are walked one character at a time so the prefix
            // never ends inside a multi-byte character.
            $prefixBytes = 0;
            $prefixChars = 0;
            while ($prefixBytes < $matchedBytes) {
                $charBytes = 1;
                // A byte with both top bits set starts a multi-byte sequence;
                // bits 0x20 and 0x10 are then tested for longer sequences.
                // NOTE(review): the $charBytes adjustments inside these three
                // conditions are missing here - confirm.
                if ((ord($term->text[$prefixBytes]) & 0xC0) == 0xC0) {
                    if (ord($term->text[$prefixBytes]) & 0x20 ) {
                        if (ord($term->text[$prefixBytes]) & 0x10 ) {

                if ($prefixBytes + $charBytes > $matchedBytes) {
                    // char crosses matched bytes boundary
                    // skip char
                    // NOTE(review): no statement follows (presumably a break) - confirm.

                $prefixBytes += $charBytes;
                // NOTE(review): $prefixChars is never updated in the visible code.

            // Write prefix length
            // NOTE(review): the write for the prefix length is missing here.
            // Write suffix
            $dicFile->writeString(substr($term->text, $prefixBytes));
        } else {
            // Write prefix length
            // Write suffix
            // NOTE(review): both writes for the no-shared-prefix case are missing here.
        // Write field number
        // DocFreq (the count of documents which contain the term)
        // NOTE(review): the writes for field number and DocFreq are missing here.

        // Written back through the reference so the caller's "previous term" advances.
        $prevTerm = $term;

        if (!isset($prevTermInfo)) {
            // Write FreqDelta
            // Write ProxDelta
            // NOTE(review): the writes for the first entry (no predecessor) are missing here.
        } else {
            // Write FreqDelta
            $dicFile->writeVInt($termInfo->freqPointer - $prevTermInfo->freqPointer);
            // Write ProxDelta
            $dicFile->writeVInt($termInfo->proxPointer - $prevTermInfo->proxPointer);
        // Write SkipOffset - it's not 0 when $termInfo->docFreq > self::$skipInterval
        // NOTE(review): the body of this condition is missing here.
        if ($termInfo->skipOffset != 0) {

        // Written back through the reference so the next entry is delta-encoded against this one.
        $prevTermInfo = $termInfo;

    /**
     * Generate compound index file
     */
    protected function _generateCFS()
        // Builds the '.cfs' compound file from the files listed in $this->_files:
        // a first pass reserves an offset slot per file, a second pass visits
        // each file again to record its real offset and read its data.
        // NOTE(review): braces and several statements appear to be missing from
        // this copy of the file; the notes mark the visible gaps - confirm
        // against the upstream source.
        $cfsFile = $this->_directory->createFile($this->_name . '.cfs');

        $dataOffsetPointers = array();
        foreach ($this->_files as $fileName) {
            // Remember where this file's offset slot is so it can be revisited later.
            $dataOffsetPointers[$fileName] = $cfsFile->tell();
            $cfsFile->writeLong(0); // write dummy data

        foreach ($this->_files as $fileName) {
            // Get actual data offset
            $dataOffset = $cfsFile->tell();
            // Seek to the data offset pointer
            // Write actual data offset value
            // Seek back to the end of file
            // NOTE(review): the statements described by the three comments above
            // are missing here; $dataOffset and $dataOffsetPointers are never
            // used in the visible code.

            $dataFile = $this->_directory->getFileObject($fileName);

            $byteCount = $this->_directory->fileLength($fileName);
            while ($byteCount > 0) {
                // Read at most 131072 bytes per iteration; the remaining count
                // is reduced by the number of bytes actually returned.
                $data = $dataFile->readBytes(min($byteCount, 131072 /*128Kb*/));
                $byteCount -= strlen($data);
                // NOTE(review): no visible statement writes $data to $cfsFile - confirm.


    /**
     * Close segment, write it to disk and return segment info
     *
     * @return Zend_Search_Lucene_Index_SegmentInfo
     */
    // Declared abstract: this class provides no body, so each concrete
    // segment writer has to supply its own implementation.
    abstract public function close();