plda Source Code (10)


Sparse LDA

The standard Gibbs sampling formula is:

q(z=k) = \frac{n^{t}_{k,\neg i} + \beta}{n_{k,\neg i} + \beta V}\left(n^{k}_{m,\neg i} + \alpha_k\right)

n^{t}_{k} is the count of word t assigned to topic k, n^{k}_{m} is the count of topic k's assignments in document m, and n_{k} is the total number of words assigned to topic k; the subscript \neg i means the current token i being resampled is excluded from the counts.

Each sampling iteration costs O(M\overline{N_m}K), where \overline{N_m} is the average length of the training documents. Memory consumption is dominated by n^{k}_{m} and n^{t}_{k}; if both are stored densely, the memory complexity is O\left(K(M+V)\right).
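For reference, here is a minimal sketch of the dense per-token draw that this cost comes from. The function and variable names (SampleTopicDense, n_kw, n_mk, n_k) are illustrative only, not plda identifiers, and the counts are assumed to already exclude the token being resampled (the \neg i terms):

#include <vector>

    // Minimal dense Gibbs draw for one token (illustrative sketch, not plda code).
    // n_kw[k]: count of the current word t in topic k (n^t_k)
    // n_mk[k]: count of topic k in the current document m (n^k_m)
    // n_k[k]:  total number of words assigned to topic k
    // u is a draw from Uniform(0, 1).
    int SampleTopicDense(const std::vector<int>& n_kw,
                         const std::vector<int>& n_mk,
                         const std::vector<int>& n_k,
                         double alpha, double beta, int vocab_size, double u) {
      const int K = static_cast<int>(n_k.size());
      std::vector<double> q(K);
      double total = 0.0;
      for (int k = 0; k < K; ++k) {  // the O(K) scan that SparseLDA avoids
        q[k] = (n_kw[k] + beta) / (n_k[k] + beta * vocab_size) * (n_mk[k] + alpha);
        total += q[k];
      }
      double choice = u * total;
      double sum_so_far = 0.0;
      for (int k = 0; k < K; ++k) {
        sum_so_far += q[k];
        if (sum_so_far >= choice) return k;
      }
      return K - 1;  // guard against floating-point round-off
    }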

SparseLDA

Because n^{k}_{m} and n^{t}_{k} are generally very sparse, SparseLDA rewrites Q as
Q = \sum_k \frac{n^{t}_{k,\neg i} + \beta}{n_{k,\neg i} + \beta V}\left(n^{k}_{m,\neg i} + \alpha_k\right) \\ = \sum_k \left(\frac{n^{t}_{k,\neg i}\left(n^{k}_{m,\neg i} + \alpha_k\right)}{n_{k,\neg i} + \beta V} + \frac{\beta n^{k}_{m,\neg i}}{n_{k,\neg i} + \beta V} + \frac{\beta \alpha_k}{n_{k,\neg i} + \beta V}\right) \\ = \underbrace{\sum_k \frac{n^{t}_{k,\neg i}\left(n^{k}_{m,\neg i} + \alpha_k\right)}{n_{k,\neg i} + \beta V}}_{E} + \underbrace{\sum_k \frac{\beta n^{k}_{m,\neg i}}{n_{k,\neg i} + \beta V}}_{F} + \underbrace{\sum_k \frac{\beta \alpha_k}{n_{k,\neg i} + \beta V}}_{G}

Write E=\sum_k e(k), F=\sum_k f(k), G=\sum_k g(k). E has |Nonzero(n^{t}_{k,\neg i})| nonzero terms and is called the "topic word" bucket; F has |Nonzero(n^{k}_{m,\neg i})| nonzero terms and is called the "document topic" bucket; G has K terms and is called the "smoothing only" bucket.

c(z=k) = \frac{n^{k}_{m,\neg i} + \alpha_k}{n_{k,\neg i} + \beta V}
e(z=k) = n^{t}_{k,\neg i}\, c(k)
f(z=k) = \frac{\beta n^{k}_{m,\neg i}}{n_{k,\neg i} + \beta V}
g(z=k) = \frac{\beta \alpha_k}{n_{k,\neg i} + \beta V}

To sample the topic of a word in a document, first compute Q=E+F+G and draw a random variable U \sim Uniform(0,Q); the concrete topic is then sampled inside one of the three buckets:

If U<E, the topic falls in the "topic word" bucket;
if U<E+F, the topic falls in the "document topic" bucket;
otherwise, the topic falls in the "smoothing only" bucket.
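A point the implementation below relies on: when word i's current assignment to topic k is removed (and symmetrically when the newly drawn assignment is added), only the k-th terms of F and G change, so both sums can be maintained in O(1) per token instead of being recomputed:

\Delta F = \frac{\beta\left(n^{k}_{m} - 1\right)}{n_{k} - 1 + \beta V} - \frac{\beta n^{k}_{m}}{n_{k} + \beta V}, \qquad \Delta G = \frac{\beta \alpha_k}{n_{k} - 1 + \beta V} - \frac{\beta \alpha_k}{n_{k} + \beta V}

where n^{k}_{m} and n_{k} are the counts before the removal; these are exactly the f_update and g_update quantities computed in the code.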

class SparseLDASampler : public BaseSampler {

 private:
  vector<double> e_;        // per-topic weights of the "topic word" bucket
  vector<double> f_;        // per-topic weights of the "document topic" bucket
  vector<double> g_;        // per-topic weights of the "smoothing only" bucket
  double G;                 // running sum of g_
  vector<double> c_;        // cached coefficient c(k) = (n^k_m + alpha) / (n_k + betaV)
  map<int64, int32> n_mk;   // sparse document-topic counts
  map<int32, int64> n_kw;   // sparse word-topic counts
};
        e_.resize(model_->num_topics());
        f_.resize(model_->num_topics());
        g_.resize(model_->num_topics());
        c_.resize(model_->num_topics());
    
    // Initialization: no document is attached yet, so all document-topic counts are treated as zero
        G = 0;
        double betaV = vocab_size * beta_;
        for (int i = 0; i < num_topics; i++) {
            c_[i] = alpha_ /
                    (model_->GetGlobalTopicDistribution()[i] + betaV);
            g_[i] = (alpha_ * beta_) / (model_->GetGlobalTopicDistribution()[i] + betaV);
            G += g_[i];
        }
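In other words, at this point c_ and g_ take their smoothing-only values,

c(k) = \frac{\alpha}{n_k + \beta V}, \qquad g(k) = \frac{\alpha \beta}{n_k + \beta V}, \qquad G = \sum_k g(k),

and c_ is only adjusted (and later reset) for the topics that actually occur in the document being sampled.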
    

  // One sampling pass over a single document
        double F = 0;
        int vocab_size;  // vocabulary size (must be initialized, e.g. from the model)
        double betaV = vocab_size * beta_;
        unordered_map<int, int> n_mk_map = document->GetTopicDist();  // the document's sparse topic -> count map
        for (auto iter = n_mk_map.begin(); iter != n_mk_map.end(); iter++) {
            int topic = iter->first;
            // topics absent from the document keep c(k) = alpha / (n_k + betaV) and f(k) = 0
            c_[topic] = (document->topic_distribution()[topic] + alpha_) /
                    (model_->GetGlobalTopicDistribution()[topic] + betaV);
            f_[topic] = (beta_ * document->topic_distribution()[topic]) /
                    (model_->GetGlobalTopicDistribution()[topic] + betaV);
            F += f_[topic];
        }
        for (LDADocument::WordOccurrenceIterator iterator(document);
                !iterator.Done(); iterator.Next()) {  // iterate over the document's word occurrences
            int current_topic = iterator.Topic();
            int old_topic = current_topic;
            double numer_mk = document->topic_distribution()[old_topic] - 1;
            double denom = model_->GetGlobalTopicDistribution()[old_topic] - 1 + betaV;
            // update all the statistics associated with old_topic (remove the current assignment)
            c_[old_topic] = (numer_mk + alpha_) / denom;
            double f_update = beta_ * numer_mk / denom - beta_ * (numer_mk + 1) / (denom + 1);
            f_[old_topic] += f_update;
            F = F + f_update;
            double g_update = beta_ * alpha_ * (1 / denom - 1/ (denom + 1));
            g_[old_topic] += g_update;
            G += g_update;
            double E = 0;
            unordered_map<int, int> n_kw_map = model_->GetTopicDistByWord(iterator.Word());  // the word's sparse topic -> count map
            for (auto kw_iter = n_kw_map.begin(); kw_iter != n_kw_map.end(); kw_iter++) {
                int topic_tmp = kw_iter->first;
                int n_kw_factor = kw_iter->second;
                if (topic_tmp == old_topic) {
                    n_kw_factor -= 1;
                }
                e_[topic_tmp] = n_kw_factor * c_[topic_tmp];
                E += e_[topic_tmp];
            }

            double total = E + F + G;
            n_mk_map = document->GetTopicDist();  // refresh: SetTopic in earlier iterations may have changed the document's topic counts
            double choice = random->RandDouble() * total;
            int new_topic = -1;
            if (choice < E) {
                new_topic = SampleInBucketWithMap(n_kw_map, e_, choice);
            } else if (choice < E + F) {
                new_topic = SampleInBucketWithMap(n_mk_map, f_, choice - E);
            } else {
                new_topic = SampleInSmoothBucket(g_, choice - E - F);
            }

            if (update_model) {
                model_->ReassignTopic(iterator.Word(), iterator.Topic(), new_topic, 1);
            }
            iterator.SetTopic(new_topic);

            // update all the statistics associated with new_topic (add the new assignment back)
            numer_mk = document->topic_distribution()[new_topic];
            denom = model_->GetGlobalTopicDistribution()[new_topic] + betaV;
            c_[new_topic] = (numer_mk + alpha_) / denom;
            f_update = beta_ * numer_mk / denom - beta_ * (numer_mk - 1) / (denom - 1);
            f_[new_topic] += f_update;
            F = F + f_update;
            g_update = beta_ * alpha_ * (1 / denom - 1/ (denom - 1));
            g_[new_topic] += g_update;
            G += g_update;
        }
        // Reset c_ for topics touched by this document back to the no-document value alpha / (n_k + betaV)
        n_mk_map = document->GetTopicDist();
        for (auto iter = n_mk_map.begin(); iter != n_mk_map.end(); iter++) {
            int topic = iter->first;
            c_[topic] = alpha_ / (model_->GetGlobalTopicDistribution()[topic] + betaV);
        }
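Putting it together: per token the sampler rebuilds E with one scan over the word's nonzero topics, maintains F and G with the O(1) updates above, and refreshes the document's topic map; the per-document setup and cleanup scan only the document's nonzero topics. Ignoring the rare draws that land in the smoothing-only bucket (whose scan is O(K)), the per-token cost is therefore roughly

O\left(\left|Nonzero(n^{k}_{m})\right| + \left|Nonzero(n^{t}_{k})\right|\right)

instead of the O(K) of the standard sampler.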
    

    int SparseLDASampler::SampleInBucketWithMap(unordered_map<int, int>& map,
            vector<double>& vec_, double choice) const {
        double sum_so_far = 0.0;
        int last_topic = -1;
        for (const auto &kv : map) {
            last_topic = kv.first;
            sum_so_far += vec_[kv.first];
            if (sum_so_far >= choice) {
                return kv.first;
            }
        }
        return last_topic;  // guard against floating-point round-off
    }

    int SparseLDASampler::SampleInSmoothBucket(
            const vector<double>& distribution, double choice) const {
        double sum_so_far = 0.0;
        for (int i = 0; i < distribution.size(); ++i) {
            sum_so_far += distribution[i];
            if (sum_so_far >= choice) {
                return i;
            }
        }
        return static_cast<int>(distribution.size()) - 1;  // guard against floating-point round-off
    }
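As a quick sanity check of the cumulative scan these helpers perform (toy numbers, not plda data): with g_ = {0.2, 0.5, 0.3} and choice = 0.6, SampleInSmoothBucket returns topic 1, since 0.2 < 0.6 ≤ 0.2 + 0.5; the same scan over a sparse map is what SampleInBucketWithMap does for the other two buckets.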