Connecting C++ Reinforcement Learning to OpenAI Gym via Python Bindings

OpenAI Gym is the most popular experimentation environment for reinforcement learning; to some extent its interface has become a de facto standard. On one hand, many algorithm implementations are built against Gym; on the other, new scenarios tend to be wrapped behind the Gym interface. This layer of abstraction decouples the algorithm from the environment, so the two can be combined freely. Gym, however, exposes a Python interface, so a reinforcement-learning algorithm written in C++ cannot talk to it directly. One option is to go cross-process: one process runs the Python environment, another runs the C++ algorithm, and the interaction data is serialized, exchanged over IPC, and deserialized on the other side. The other option is a single process: Gym and the algorithm run in the same process, connected through a Python binding. This article takes the latter route and uses pybind11 as the bridge, so that Gym and a C++ reinforcement-learning algorithm can communicate within one process.

For the C++ machine-learning framework I use LibTorch, the C++ distribution of PyTorch. Among the mainstream training frameworks it currently has comparatively good C++ support and is easy to install; see "INSTALLING C++ DISTRIBUTIONS OF PYTORCH" for the installation steps. The official samples include reinforce.py, an example of the REINFORCE algorithm (old but classic), and we will use it as the starting point.
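
As a quick sanity check that LibTorch is installed and visible to your toolchain, something along the lines of the official guide's example can be built first (a minimal smoke test, not part of the project code):

#include <torch/torch.h>
#include <iostream>

int main() {
    // If this compiles, links and runs, the LibTorch headers/libraries are usable.
    torch::Tensor t = torch::rand({2, 3});
    std::cout << t << std::endl;
    std::cout << "CUDA available: " << std::boolalpha
              << torch::cuda::is_available() << std::endl;
    return 0;
}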

Let's start with the Python part, which makes only small changes to the original sample. The calls into the reinforcement-learning algorithm are abstracted behind an RLWrapper class, which will later be bound to C++. It is constructed with descriptions of the Gym environment's state and action spaces (for CartPole-v1, for instance, a Box observation of dtype float32 and shape (4,), and a Discrete action space with two actions). reset() notifies the agent that the environment has been reset and passes in the initial state, act() returns an action for the current state according to the policy, and update() feeds back the reward so the policy parameters can be learned.

...
def state_space_desc(space):
    if isinstance(space, gym.spaces.Box):
        assert(type(space.shape)==tuple)
        return dict(stype='Box', dtype=str(space.dtype), shape=space.shape)
    else:
        raise NotImplementedError('unknown state space {}'.format(space))

def action_space_desc(space):
    if isinstance(space, gym.spaces.Discrete):
        return dict(stype='Discrete', dtype=str(space.dtype), shape=(space.n,))
    else:
        raise NotImplementedError('unknown action space {}'.format(space))

def main(args):
    env = gym.make(args.env)
    env.seed(args.seed)

    agent = nativerl.RLWrapper(state_space_desc(env.observation_space),
            action_space_desc(env.action_space))

    running_reward = 10
    for i in range(args.epoch):
        obs = env.reset()
        ep_reward = 0
        agent.reset(obs)
        for t in range(1, args.step):
            if args.render:
                env.render()
            action = agent.act(obs)                     # policy picks an action for the current state
            obs, reward, done, info = env.step(action)  # advance the environment
            agent.update(reward, done)                  # feed the reward back to the agent
            ep_reward += reward
            if done:
                break

        # exponentially weighted moving average of the episode reward
        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
        agent.episode_finish()

        if i % args.log_itv == 0:
            print("Episode {}\t Last reward: {:.2f}\t step: {}\t Average reward: {:.2f}"
                    .format(i, ep_reward, t, running_reward))
        if env.spec.reward_threshold and running_reward > env.spec.reward_threshold:
            print("Solved. Running reward: {}, Last reward: {}".format(running_reward, t))
            break

    env.close()

Next comes the Python binding for RLWrapper. Its main job is converting the Python objects into C++ data structures.

...
namespace py = pybind11;

class RLWrapper {
public:
    RLWrapper(const py::dict& state_space, const py::dict& action_space) {
        spdlog::set_level(spdlog::level::info);
        torch::manual_seed(nrl::kSeed);

        nrl::SpaceDesc ss;
        nrl::SpaceDesc as;
        ss.stype = py::cast<std::string>(state_space["stype"]);
        as.stype = py::cast<std::string>(action_space["stype"]);
        ss.dtype = py::cast<std::string>(state_space["dtype"]);
        as.dtype = py::cast<std::string>(action_space["dtype"]);

        py::tuple shape;
        shape = py::cast<py::tuple>(state_space["shape"]);
        for (const auto& item : shape) {
            ss.shape.push_back(py::cast<int64_t>(item));
        }
        shape = py::cast<py::tuple>(action_space["shape"]);
        for (const auto& item : shape) {
            as.shape.push_back(py::cast<int64_t>(item));
        }

        mStateSpaceDesc = ss;
        mActionSpaceDesc = as;
        mAgent = std::make_shared<nrl::Reinforce>(ss, as);
    }

    void reset(py::array_t<float, py::array::c_style | py::array::forcecast> state) {
        py::buffer_info buf = state.request();
        float* pbuf = static_cast<float*>(buf.ptr);
        
        assert(buf.shape == mStateSpaceDesc.shape);
        mAgent->reset(nrl::Blob{pbuf, mStateSpaceDesc.shape});
    }

    py::object act(py::array_t<float, py::array::c_style | py::array::forcecast> state) {
        py::buffer_info buf = state.request();
        float* pbuf = static_cast<float *>(buf.ptr);
        assert(buf.shape == mStateSpaceDesc.shape);
        torch::Tensor action = mAgent->act(nrl::Blob{pbuf, mStateSpaceDesc.shape}).contiguous().cpu();
        return py::int_(action.item<long>());
    }

    void update(float reward, bool done) {
        mAgent->update(reward, done);
    }

    void episode_finish() {
        spdlog::trace("{}", __func__);
        mAgent->onEpisodeFinished();
    }

    ~RLWrapper() {
    }

private:
    nrl::SpaceDesc mStateSpaceDesc;
    nrl::SpaceDesc mActionSpaceDesc;
    std::shared_ptr<nrl::RLBase> mAgent;
};

PYBIND11_MODULE(nativerl, m) {
    py::class_<RLWrapper>(m, "RLWrapper")
        .def(py::init<const py::dict &, const py::dict &>())
        .def("reset", &RLWrapper::reset)
        .def("episode_finish", &RLWrapper::episode_finish)
        .def("act", &RLWrapper::act)
        .def("update", &RLWrapper::update);
}

This is essentially the glue layer between Python and C++. The real work is delegated to the RLBase class, an abstract class that defines a handful of basic reinforcement-learning interfaces; the REINFORCE algorithm is then implemented in its derived class Reinforce, shown after the sketch below.
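
RLBase itself is not shown in the article. Here is a minimal sketch of what the base class would need to provide, inferred from the calls made by RLWrapper and the methods Reinforce overrides (the exact signatures, the SpaceDesc/Blob layouts and the mDevice member are assumptions):

#include <torch/torch.h>

#include <cstdint>
#include <string>
#include <vector>

namespace nrl {

// Space description, as filled in from the Python dicts by the binding layer.
struct SpaceDesc {
    std::string stype;            // "Box", "Discrete", ...
    std::string dtype;            // numpy dtype string
    std::vector<int64_t> shape;
};

// Non-owning view over a state buffer handed over from numpy.
struct Blob {
    float* pbuf;
    std::vector<int64_t> shape;
};

// Abstract interface that every algorithm implements.
class RLBase {
public:
    virtual ~RLBase() = default;

    virtual void reset(const Blob& /*state*/) {}         // start of a new episode
    virtual torch::Tensor act(const Blob& state) = 0;    // choose an action for a state
    virtual void update(float reward, bool done) = 0;    // feed back reward / termination
    virtual void onEpisodeFinished() = 0;                // end-of-episode hook (training)

protected:
    // Reinforce uses mDevice in its constructor, so it is assumed to live here.
    torch::Device mDevice{torch::cuda::is_available() ? torch::kCUDA : torch::kCPU};
};

}  // namespace nrl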

...
class Reinforce : public RLBase {
public:
    Reinforce(const SpaceDesc& ss, const SpaceDesc& as)
        : mPolicy(std::make_shared<Policy>(ss, as, mDevice)) {
        mPolicy->to(mDevice);

        mRewards = torch::zeros({mCapacity}, torch::TensorOptions(mDevice));
        mReturns = torch::zeros({mCapacity}, torch::TensorOptions(mDevice));

        mOptimizer = std::make_shared<torch::optim::Adam>(mPolicy->parameters(), 
                torch::optim::AdamOptions(mInitLR));
        }

    virtual torch::Tensor act(const Blob& s) override {
        auto state = torch::from_blob(s.pbuf, s.shape).unsqueeze(0).to(mDevice);
        torch::Tensor action;
        torch::Tensor logProb;
        std::tie(action, logProb) = mPolicy->act(state);
        mLogProbs.push_back(logProb);
        return action;
    }

    virtual void update(float r, __attribute__((unused)) bool done) override {
        mRewards[mSize++] = r;
        if (mSize >= mCapacity) {
            spdlog::info("buffer has been full, call train()");
            train();
        }
    }

    virtual void onEpisodeFinished() override {
        train();
    }

private:
    void train() {
        spdlog::trace("{}: buffer size = {}", __func__, mSize);

        for (auto i = mSize - 1; i >= 0; --i) {
            if (i == (mSize - 1)) {
                mReturns[i] = mRewards[i];
            } else {
                mReturns[i] = mReturns[i + 1] * mGamma + mRewards[i];
            }
        } 
        auto returns = mReturns.slice(0, 0, mSize);
        returns = (returns - returns.mean()) / (returns.std() + kEps);

        auto logprobs = torch::cat(mLogProbs);

        mOptimizer->zero_grad();
        auto policy_loss = -(logprobs * returns).sum();
        policy_loss.backward();
        mOptimizer->step();

        // Sum only the rewards collected in this round; mRewards is not zeroed,
        // so entries beyond mSize may hold stale values from earlier episodes.
        float accReward = mRewards.slice(0, 0, mSize).sum().item<float>();

        mLogProbs.clear();
        mSize = 0;
        ++mCount;
        spdlog::debug("{} : episode {}: loss = {}, accumulated reward = {}",
                __func__, mCount, policy_loss.item<float>(), accReward);
    }

    std::shared_ptr<Policy> mPolicy;

    torch::Tensor mRewards;
    std::vector<torch::Tensor> mLogProbs;
    torch::Tensor mReturns;
    int32_t mSize{0};
    int32_t mCapacity{kExpBufferCap};

    std::shared_ptr<torch::optim::Adam> mOptimizer;
    uint32_t mCount{0};

    float mGamma{0.99};
    float mInitLR{1e-2};
};
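
For reference, train() implements the plain REINFORCE update: discounted returns are accumulated backwards over the stored rewards, normalized (a common variance-reduction trick), and used to weight the log-probabilities of the actions taken:

$$
G_t = r_t + \gamma\, G_{t+1}, \qquad
\hat{G}_t = \frac{G_t - \operatorname{mean}(G)}{\operatorname{std}(G) + \epsilon}, \qquad
L(\theta) = -\sum_t \log \pi_\theta(a_t \mid s_t)\, \hat{G}_t
$$

Minimizing this loss with Adam performs gradient ascent on the expected return; it is exactly the policy_loss computed in train() above.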

The scenario in the sample is CartPole, which is simple, so the policy network here is just a small MLP; more complex scenarios can swap in a larger network (the Policy class below switches to a CNN backbone when the observation has more than one dimension; a sketch of such a backbone follows the code).

...
class Net : public nn::Module {
public:
    virtual std::tuple<Tensor, Tensor> forward(Tensor x) = 0;

    virtual ~Net() = default;
};

class MLP : public Net {
public:
    MLP(int64_t inputSize, int64_t actionNum) {
        mFC1 = register_module("fc1", nn::Linear(inputSize, mHiddenSize));
        mAction = register_module("action", nn::Linear(mHiddenSize, actionNum));
        // Value head; not used by the REINFORCE loss above, kept for other algorithms.
        mValue = register_module("value", nn::Linear(mHiddenSize, actionNum));
    }

    virtual std::tuple<Tensor, Tensor> forward(Tensor x) override {
        x = mFC1->forward(x);
        x = dropout(x, 0.6, is_training());
        x = relu(x);
        return std::make_tuple(mAction->forward(x), mValue->forward(x));
    }

private:
    nn::Linear mFC1{nullptr};
    nn::Linear mAction{nullptr};
    nn::Linear mValue{nullptr};
    int64_t mHiddenSize{128};
};

class Policy : public torch::nn::Module {
public:
    Policy(const SpaceDesc& ss, const SpaceDesc& as, torch::Device mDevice) 
        : mActionSpaceType(as.stype),
        mActionNum(as.shape[0]),
        mGen(kSeed),
        mUniformDist(0, 1.0) {

        if (ss.shape.size() == 1) {
            mNet = std::make_shared<MLP>(ss.shape[0], as.shape[0]);
        } else {
            mNet = std::make_shared<CNN>(ss.shape, as.shape[0]);
        }
        mNet->to(mDevice);
        register_module("base", mNet);

        torch::Tensor logits = torch::ones({1, as.shape[0]}, torch::TensorOptions(mDevice));
        mUniformCategorical = std::make_shared<Categorical>(nullptr, &logits);

    }

    torch::Tensor forward(torch::Tensor x) {
        x = std::get<0>(mNet->forward(x));
        return torch::softmax(x, 1);
    }

    std::tuple<torch::Tensor, torch::Tensor> act(torch::Tensor state) {
        auto output = forward(state);
        std::shared_ptr<Distribution> dist;
        if (mActionSpaceType == "Discrete") {
            dist = std::make_shared<Categorical>(&output);
        } else {
            throw std::logic_error("Not implemented : action space");
        }

        float rnd = mUniformDist(mGen);
        float threshold = kEpsEnd + (kEpsStart - kEpsEnd) * exp(-1. * mStep / kEpsDecay);
        ++mStep;
        torch::Tensor action;
        if (rnd > threshold) {
            torch::NoGradGuard no_grad;
            action = dist->sample();
        } else {
            torch::NoGradGuard no_grad;
            action = mUniformCategorical->sample({1}).squeeze(-1);
        }

        auto log_probs = dist->log_prob(action);
        return std::make_tuple(action, log_probs);
    }

private:
    std::string mActionSpaceType;
    int32_t mActionNum;
    int64_t mHiddenSize{128};
    std::shared_ptr<Net> mNet;
    uint64_t mStep{0};

    std::mt19937 mGen;
    std::uniform_real_distribution<float> mUniformDist;
    std::shared_ptr<Categorical> mUniformCategorical;
};
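
The CNN backbone referenced in the Policy constructor is not shown in the article either. A rough sketch of what it could look like, assuming image-like CHW observations and the same using-declarations as the code above (channel counts and kernel sizes are made up for illustration):

class CNN : public Net {
public:
    CNN(const std::vector<int64_t>& inputShape, int64_t actionNum) {
        // Assumes inputShape = {C, H, W}.
        int64_t c = inputShape[0], h = inputShape[1], w = inputShape[2];
        mConv1 = register_module("conv1",
                nn::Conv2d(nn::Conv2dOptions(c, 16, 3).stride(2).padding(1)));
        mConv2 = register_module("conv2",
                nn::Conv2d(nn::Conv2dOptions(16, 32, 3).stride(2).padding(1)));
        // Two stride-2 convolutions: each halves the spatial size (rounded up).
        int64_t oh = (h + 3) / 4, ow = (w + 3) / 4;
        mAction = register_module("action", nn::Linear(32 * oh * ow, actionNum));
        mValue = register_module("value", nn::Linear(32 * oh * ow, 1));
    }

    virtual std::tuple<Tensor, Tensor> forward(Tensor x) override {
        x = relu(mConv1->forward(x));
        x = relu(mConv2->forward(x));
        x = x.flatten(1);  // keep the batch dimension
        return std::make_tuple(mAction->forward(x), mValue->forward(x));
    }

private:
    nn::Conv2d mConv1{nullptr};
    nn::Conv2d mConv2{nullptr};
    nn::Linear mAction{nullptr};
    nn::Linear mValue{nullptr};
};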

The Categorical class used here implements the categorical-distribution computations; it can be ported to C++ from the Python implementation in PyTorch (torch.distributions.Categorical).
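
The real class also derives from a Distribution base and supports a sample-shape argument (as used by mUniformCategorical->sample({1}) above); those parts are omitted here. A minimal sketch of the core sample()/log_prob() logic for the discrete case, mirroring torch.distributions.Categorical (the constructor signature is an assumption):

class Categorical {
public:
    // Construct from probabilities or (unnormalized) logits.
    Categorical(const torch::Tensor* probs, const torch::Tensor* logits = nullptr) {
        mProbs = probs ? (*probs / probs->sum(-1, /*keepdim=*/true))
                       : torch::softmax(*logits, -1);
        mLogProbs = torch::log(mProbs);
    }

    // One sample per batch row, shape [batch].
    torch::Tensor sample() const {
        return torch::multinomial(mProbs, /*num_samples=*/1, /*replacement=*/true)
                .squeeze(-1);
    }

    // log p(action), gathered per batch row, shape [batch].
    torch::Tensor log_prob(const torch::Tensor& action) const {
        return mLogProbs.gather(-1, action.unsqueeze(-1)).squeeze(-1);
    }

private:
    torch::Tensor mProbs;
    torch::Tensor mLogProbs;
};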

Finally, compile the C++ code above into a shared library (.so). Add something along these lines to CMakeLists.txt, adapted to your setup:

...
set(CMAKE_CXX_STANDARD 11)   # newer LibTorch releases require C++14/17; raise this if the build complains

find_package(Torch REQUIRED)

set(NRL_INCLUDE_DIRS
    src
    ${TORCH_INCLUDE_DIRS})

file(GLOB NRL_SOURCES1 "src/*.cpp")
list(APPEND NRL_SOURCES ${NRL_SOURCES1})
message(STATUS "sources: ${NRL_SOURCES}")

add_subdirectory(third_party/pybind11)
add_subdirectory(third_party/spdlog)

pybind11_add_module(nativerl ${NRL_SOURCES})
target_include_directories(nativerl PRIVATE ${NRL_INCLUDE_DIRS})
target_link_libraries(nativerl PRIVATE spdlog::spdlog ${TORCH_LIBRARIES})
...

Assuming the compiled .so ends up under the build directory and the Python script is example/simple.py, training can be started with:

PYTHONPATH=./build python -m example.simple

If everything works, you should see a training log like the one below; the results are essentially in line with the Python version.

[2019-06-22 13:42:22.533] [info] state space type:Box shape size:1
[2019-06-22 13:42:22.534] [info] action space type:Discrete, shape size:1
[2019-06-22 13:42:22.534] [info] Training on GPU (CUDA)
Episode 0	 Last reward: 29.00	 step: 29	 Average reward: 10.95
Episode 10	 Last reward: 17.00	 step: 17	 Average reward: 14.73
Episode 20	 Last reward: 12.00	 step: 12	 Average reward: 17.40
Episode 30	 Last reward: 15.00	 step: 15	 Average reward: 24.47
Episode 40	 Last reward: 18.00	 step: 18	 Average reward: 26.22
Episode 50	 Last reward: 18.00	 step: 18	 Average reward: 23.69
Episode 60	 Last reward: 72.00	 step: 72	 Average reward: 30.21
Episode 70	 Last reward: 19.00	 step: 19	 Average reward: 28.83
Episode 80	 Last reward: 29.00	 step: 29	 Average reward: 32.13
Episode 90	 Last reward: 15.00	 step: 15	 Average reward: 29.64
Episode 100	 Last reward: 30.00	 step: 30	 Average reward: 27.88
Episode 110	 Last reward: 12.00	 step: 12	 Average reward: 26.14
Episode 120	 Last reward: 28.00	 step: 28	 Average reward: 26.32
Episode 130	 Last reward: 11.00	 step: 11	 Average reward: 31.20
Episode 140	 Last reward: 112.00	 step: 112	 Average reward: 35.26
Episode 150	 Last reward: 40.00	 step: 40	 Average reward: 37.14
Episode 160	 Last reward: 40.00	 step: 40	 Average reward: 36.84
Episode 170	 Last reward: 15.00	 step: 15	 Average reward: 41.91
Episode 180	 Last reward: 63.00	 step: 63	 Average reward: 49.78
Episode 190	 Last reward: 21.00	 step: 21	 Average reward: 44.70
Episode 200	 Last reward: 46.00	 step: 46	 Average reward: 41.83
Episode 210	 Last reward: 80.00	 step: 80	 Average reward: 51.55
Episode 220	 Last reward: 151.00	 step: 151	 Average reward: 57.82
Episode 230	 Last reward: 176.00	 step: 176	 Average reward: 62.80
Episode 240	 Last reward: 19.00	 step: 19	 Average reward: 63.17
Episode 250	 Last reward: 134.00	 step: 134	 Average reward: 74.02
Episode 260	 Last reward: 46.00	 step: 46	 Average reward: 71.35
Episode 270	 Last reward: 118.00	 step: 118	 Average reward: 85.88
Episode 280	 Last reward: 487.00	 step: 487	 Average reward: 112.74
Episode 290	 Last reward: 95.00	 step: 95	 Average reward: 139.41
Episode 300	 Last reward: 54.00	 step: 54	 Average reward: 149.20
Episode 310	 Last reward: 417.00	 step: 417	 Average reward: 138.42
Episode 320	 Last reward: 500.00	 step: 500	 Average reward: 179.29
Episode 330	 Last reward: 71.00	 step: 71	 Average reward: 195.88
Episode 340	 Last reward: 309.00	 step: 309	 Average reward: 216.82
Episode 350	 Last reward: 268.00	 step: 268	 Average reward: 214.21
Episode 360	 Last reward: 243.00	 step: 243	 Average reward: 210.89
Episode 370	 Last reward: 266.00	 step: 266	 Average reward: 200.03
Episode 380	 Last reward: 379.00	 step: 379	 Average reward: 220.06
Episode 390	 Last reward: 500.00	 step: 500	 Average reward: 316.20
Episode 400	 Last reward: 500.00	 step: 500	 Average reward: 369.46
Episode 410	 Last reward: 500.00	 step: 500	 Average reward: 421.84
Episode 420	 Last reward: 500.00	 step: 500	 Average reward: 453.20
Episode 430	 Last reward: 500.00	 step: 500	 Average reward: 471.98
Solved. Running reward: 475.9764491024681, Last reward: 500
