plda source code (9)

BaseSampler abstracts the Sampler: it adds word-similarity support and provides an interface for plugging in new sampling methods.

    class BaseSampler {
    public:
        BaseSampler(double alpha, double beta,
                LDAModel* model,
                LDAAccumulativeModel* accum_model);

        virtual void InitModelGivenTopics(const LDACorpus& corpus);

        virtual void SpecificInit(LDAModel*) {
        }

        virtual void DoIteration(Random* random, LDACorpus* corpus, bool train_model, bool burn_in);

        virtual void SampleNewTopicsForDocument(Random* random, LDADocument* document,
                bool update_model) = 0;

        virtual double LogLikelihood(LDADocument* document) const;
        virtual double LogLikelihood(LDACorpus* document) const;

    protected:
        inline double GetFactorInModel(int w, int k, int ajustment) const {  // product of the similarities of w's similar words that have a nonzero count on topic k
            ElemIter iter = matrix->GetRowWithIterUnsorted(w);
            double res = 1.0;
            while (iter.HasNext()) {
                double sim;
                int word;
                iter.GetNext(sim, word);
                const TopicDistribution<int32>& word_distribution = model_->GetWordTopicDistribution(word);
                if (word_distribution[k] > 0) {
                    res *= sim;
                    if (use_model_weight) {
                        res *= GetModelWeight(word, k, ajustment);
                    }
                }
            }
            return res;
        }

        inline double GetFactorInDoc(int word, int k, const LDADocument* document) const {
            // product of the similarities of word's similar words that have topic k assigned in document
            ElemIter iter = matrix->GetRowWithIterUnsorted(word);
            double res = 1.0;
            while (iter.HasNext()) {
                double sim;
                int sim_word;  // a word similar to `word` (renamed to avoid shadowing the parameter)
                iter.GetNext(sim, sim_word);
                if (HasSameTopic(sim_word, k, document)) {
                    res *= sim;
                }
            }
            return res;
        }

        inline bool HasSameTopic(int word, int k, const LDADocument* document) const {
            // whether word has topic k assigned in document
            ......
            return true;
        }

        inline double GetModelWeight(int word, int k, int ajustment) const {
            // exp(count of word on topic k / (count of all words on topic k + ajustment))
            // presumably fetched from model_; the excerpt omits these initializers
            const TopicDistribution<int32>& word_distribution = model_->GetWordTopicDistribution(word);
            const TopicDistribution<int32>& global_distribution = model_->GetGlobalTopicDistribution();
            double relative_weight = 1.0;
            if (global_distribution[k] + ajustment > 0) {
                relative_weight = ((double) word_distribution[k]) / (global_distribution[k] + ajustment);
            }
            double result = fast_exp(relative_weight);  // fast approximation of exp()
            return result < 1.0 ? 1.0 : result;
        }

        const double alpha_;
        const double beta_;
        LDAModel* model_;
        LDAAccumulativeModel* accum_model_;

        WordMatrix* matrix;
        int mode;
        bool use_model_weight;
        const int DOC_MODE = 1;
        const int MODEL_MODE = 2;
    };
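
Since `SampleNewTopicsForDocument` is the only pure virtual method, a new sampling strategy just has to derive from BaseSampler and override it (plus `SpecificInit` if it needs extra setup). A minimal sketch of such a subclass follows; the class name `MySampler` and the method body are purely illustrative and not part of the plda source:

    // Sketch only: a new sampler plugged into the BaseSampler interface.
    class MySampler : public BaseSampler {  // hypothetical name
    public:
        MySampler(double alpha, double beta,
                  LDAModel* model, LDAAccumulativeModel* accum_model)
            : BaseSampler(alpha, beta, model, accum_model) {}

        void SampleNewTopicsForDocument(Random* random, LDADocument* document,
                                        bool update_model) override {
            // Walk the document's words, draw a new topic for each occurrence,
            // and (if update_model) reflect the change in model_ -- details omitted.
        }
    };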

NaiveSampler
NaiveSampler adds word-similarity-aware sampling on top of plain Gibbs sampling.

    class NaiveSampler : public BaseSampler

    void NaiveSampler::GenerateTopicDistributionForWord(
      ...
        double origin_gibbs_score = (topic_word_factor + beta_) *
            (document_topic_factor + alpha_) /
            (global_topic_factor + vocab_size * beta_);
        double ajust_prob = ComputeAjustProb(word, k, origin_gibbs_score, &document, current_topic_adjustment);
        distribution->push_back(ajust_prob);
      }
    }
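
The adjusted scores pushed into `distribution` are unnormalized, so the caller (presumably `SampleNewTopicsForDocument`) still has to draw a topic proportional to them. A sketch of that standard roulette-wheel draw is shown below; the helper name `SampleFromDistribution` and the parameter `u` are assumptions, not taken from the plda source:

    #include <vector>

    // Sketch only: draw a topic index proportional to the unnormalized,
    // adjusted scores. `u` stands for a uniform [0, 1) value obtained from
    // the sampler's Random object.
    int SampleFromDistribution(const std::vector<double>& distribution, double u) {
        double sum = 0.0;
        for (double p : distribution) sum += p;
        double threshold = u * sum;  // rescale the draw instead of normalizing
        double running = 0.0;
        for (int k = 0; k < static_cast<int>(distribution.size()); ++k) {
            running += distribution[k];
            if (running >= threshold) return k;
        }
        return static_cast<int>(distribution.size()) - 1;  // numerical fallback
    }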

ComputeAjustProb then applies the word-similarity adjustment according to the mode:

    double NaiveSampler::ComputeAjustProb(int word, int iter_topic, double origin_gibbs_score, const LDADocument* document,
            int current_topic_adjustment) const {
        if (mode == DOC_MODE) {
            return origin_gibbs_score * GetFactorInDoc(word, iter_topic, document);  // within the same document, similar words that land on the same topic give a boost
        } else if (mode == MODEL_MODE) {
            return origin_gibbs_score * GetFactorInModel(word, iter_topic, current_topic_adjustment);  // global mode: similar words that land on the same topic give a boost
        }
        return origin_gibbs_score;
    }
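
So the plain Gibbs score is left untouched unless one of the two modes is active: DOC_MODE multiplies in the in-document similarity factor, MODEL_MODE the global one, optionally scaled further by GetModelWeight. Below is a small self-contained worked example of the GetModelWeight formula, weight = max(1, exp(n_wk / (n_k + ajustment))); the counts are made-up illustrative values, and std::exp stands in for fast_exp:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    int main() {
        int n_wk = 30;       // count of `word` assigned to topic k (illustrative)
        int n_k = 120;       // count of all words assigned to topic k (illustrative)
        int ajustment = -1;  // e.g. excluding the current assignment (illustrative)
        double relative_weight = static_cast<double>(n_wk) / (n_k + ajustment);
        double weight = std::max(1.0, std::exp(relative_weight));
        std::printf("relative_weight=%.3f weight=%.3f\n", relative_weight, weight);
        // prints roughly: relative_weight=0.252 weight=1.287
        return 0;
    }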