#pragma once

#include <folly/FBVector.h>
#include <folly/Hash.h>
#include <folly/executors/SerialExecutor.h>
#include <folly/futures/Future.h>
#include <gflags/gflags.h>
#include <wangle/channel/Handler.h>
#include <wangle/service/Service.h>

#include <atomic>
#include <boost/multi_index/hashed_index.hpp>
#include <boost/multi_index/key_extractors.hpp>
#include <boost/multi_index_container.hpp>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <deque>
#include <functional>
#include <memory>
#include <mutex>
#include <queue>
#include <tuple>
#include <unordered_map>
#include <utility>

#include "common.hpp"
#include "minMaxWindow.hpp"
#include "reader.hpp"

namespace rtransfer {

// Priority assigned to shaper operations: processing acks and scheduled
// sends has to take precedence over merely adding new requests.
constexpr std::int8_t SHAPER_OPS_PRIO{20};

// Forward declaration only; the template is defined in another header.
template <class Clock>
class ShaperTimer;

/**
 * Smallest gain that still lets the sending rate double every round
 * (2/ln(2) ~= 2.89). Used in Startup mode as the value of both pacingGain_
 * and cwndGain_.
 */
constexpr double BBRHighGain{2.885};
/**
 * Length of the RTprop min filter window: 10 seconds.
 */
constexpr auto RTpropFilterLen = std::chrono::seconds{10};
/**
 * Length of the BBR.BtlBw max filter window (BBR.BtlBwFilter):
 * 10 packet-timed round trips.
 */
constexpr std::size_t BtlBWFilterLen = 10;
/**
 * Lower bound on the cwnd that BBR tries to target: 4 packets, i.e.
 * 4 * SMSS, with 1500 bytes used per packet here.
 */
constexpr std::size_t BBRMinPipeCwnd{4 * 1500};
/**
 * How many phases the BBR ProbeBW gain cycle consists of.
 */
constexpr std::size_t BBRGainCycleLen{8};
/**
 * Minimum time the ProbeRTT state keeps inflight at BBRMinPipeCwnd or fewer
 * packets.
 */
constexpr auto ProbeRTTDuration = std::chrono::milliseconds{200};

/**
 * This is a *very* complicated and fragile class.
 * At its core it's a faithful implementation of @link
 * https://tools.ietf.org/html/draft-cardwell-iccrg-bbr-congestion-control-00
 * and @link
 * https://tools.ietf.org/html/draft-cheng-iccrg-delivery-rate-estimation-00
 * The only deviation from the spec is the "minimum bandwidth", which
 * originally is "4 packets"; since packets don't mean a thing on this level
 * it's estimated as 3750 bytes.
 * NOTE(review): BBRMinPipeCwnd is defined as 4 * 1500 = 6000 bytes, which does
 * not match the 3750 figure quoted here - confirm which value is current.
 * The send quantum of BBR algorithm is set to `FLAGS_shaper_quantum_ms_size`
 * times the estimated bandwidth - we're not in the kernel so we can't sleep for
 * very precise periods, thus we need to sleep long enough so that a few
 * milliseconds delay in waking our thread up won't kill the performance.
 */
class Shaper {
    // All timestamps in this class are taken from a monotonic clock.
    using Clock = std::chrono::steady_clock;

    /**
     * Book-keeping for a single fetch message handed to the shaper.
     */
    struct Request {
        explicit Request(MsgPtr msg);

        /**
         *  Instance of the original complete fetch message.
         */
        MsgPtr fetch;

        folly::Promise<std::size_t> promise;
        folly::fbvector<folly::Future<std::size_t>> readFutures;
        // Zero by default so the counter is well defined even if the
        // out-of-line constructor does not set it explicitly.
        std::size_t retries = 0;
    };

    /**
     * Queue of pending requests backed by a std::deque.
     * NOTE(review): `simultaneous_` and `idx_` suggest that several requests
     * at the head of the queue are served in rotation - confirm against the
     * implementation.
     */
    class RequestQueue {
    public:
        explicit RequestQueue(std::size_t simultaneous);
        void push(std::shared_ptr<Request> req);
        bool empty() const;
        std::shared_ptr<Request> front();
        void pop();

    private:
        std::size_t simultaneous_;
        std::size_t idx_ = 0;
        std::deque<std::shared_ptr<Request>> queue_;
    };

    /**
     * Per-packet state recorded for sent data.
     * NOTE(review): the field names follow the per-packet variables of the
     * delivery-rate-estimation draft (P.delivered, P.delivered_time,
     * P.first_sent_time, P.is_app_limited, P.sent_time) and presumably carry
     * the same meaning - confirm against the implementation.
     */
    struct Packet {
        std::size_t delivered{};
        Clock::time_point deliveredTime;
        Clock::time_point firstSentTime;
        std::size_t isAppLimited{};
        Clock::time_point sentTime;
        std::size_t size{};
    };

    // Modes of the shaper's state machine; the initial mode is startup
    // (see state_).
    enum class State { probeBW, probeRTT, startup, drain };

public:
    /**
     * Identifier of a shaper: source storage, destination storage and
     * priority.
     */
    using ID = std::tuple<folly::fbstring /* source storage id */,
        folly::fbstring /* destination storage id */,
        std::uint8_t /* priority */>;

    // Positions of the corresponding elements within ID.
    static constexpr auto ID_SRC_ = 0U;
    static constexpr auto ID_DEST_ = 1U;
    static constexpr auto ID_PRIO_ = 2U;

    /**
     * @param id     Identifier of this shaper.
     * @param reader Reader kept by reference; it must outlive the shaper.
     * @param timer  Timer kept by reference; it must outlive the shaper.
     */
    Shaper(Shaper::ID id, Reader &reader, ShaperTimer<Clock> &timer);

    /**
     * Executes doRead() over the attached serial executor.
     */
    folly::Future<std::size_t> read(MsgPtr msg);

    folly::Future<folly::Unit> cancel(MsgPtr msg);

    folly::Future<bool> ack(
        std::uint64_t reqId, folly::fbvector<std::uint64_t> offsets);

    void scheduledSendPacket();

private:
    // NOTE(review): the private operations below are declared here and defined
    // elsewhere; their names appear to mirror the procedures of the two drafts
    // referenced in the class description - see the implementation for the
    // exact behaviour.

    folly::Future<std::size_t> doRead(MsgPtr msg);

    void doCancel(MsgPtr msg);

    void doAck(
        Clock::time_point now, std::uint64_t reqId, std::uint64_t offset);

    void enterStartup();

    void updateModelAndState(const Packet &packet, Clock::time_point now,
        std::chrono::microseconds rtt);

    void updateBtlBw(const Packet &packet);

    void updateRound(const Packet &packet);

    void updateRtProp(Clock::time_point now, std::chrono::microseconds rtt);

    void updateControlParameters(const Packet &packet);

    void setPacingRateWithGain(double pacingGain);

    void setPacingRate();

    void setSendQuantum();

    std::size_t inFlight(double gain);

    void updateTargetCwnd();

    void saveCwnd();

    void restoreCwnd();

    void modulateCwndForProbeRTT();

    void setCwnd(const Packet &packet);

    void checkFullPipe();

    void enterDrain();

    void checkDrain(Clock::time_point now);

    void enterProbeBw(Clock::time_point now);

    void checkCyclePhase(Clock::time_point now);

    void advanceCyclePhase(Clock::time_point now);

    bool isNextCyclePhase(Clock::time_point now);

    void handleRestartFromIdle();

    void checkProbeRTT(Clock::time_point now);

    void enterProbeRTT();

    void handleProbeRTT(Clock::time_point now);

    void exitProbeRTT(Clock::time_point now);

    void onSendPacket(Clock::time_point now, std::uint64_t reqId,
        std::uint64_t offset, std::size_t size);

    bool generateRateSample(const Packet &p, Clock::time_point now);

    void updateRateSample(const Packet &p, Clock::time_point now);

    void sendPacket(Clock::time_point now);

    void scheduleUpdate(Clock::time_point now);

    std::size_t sendPacket(Clock::time_point now, std::size_t toSend);

    void enqueueFetchSubRequestWithRetry(std::shared_ptr<Request> req,
        std::unique_ptr<proto::LinkMessage> &&subRequestFetchMessage,
        std::size_t size);

    void fulfillWholeRequest(std::shared_ptr<Request> req);

    void fulfillSubRequest(std::shared_ptr<Request> req, std::size_t size);

    folly::Future<std::size_t> readFromStorage(MsgPtr req);

    std::size_t increaseInFlightSize(std::size_t size);

    std::size_t decreaseInFlightSize(std::size_t size);

    ID id_;

    RequestQueue pendingRequests_{1000};
    std::size_t pendingReqIdx_ = 0;

    // Requests keyed by request id.
    std::unordered_map<std::uint64_t, std::shared_ptr<Request>> reqs_;

    Reader &reader_;
    folly::Executor::KeepAlive<folly::SerialExecutor> executor_;

    // https://tools.ietf.org/html/draft-cheng-iccrg-delivery-rate-estimation-00
    // Sent packets are keyed by the (request id, offset) pair.
    using ReqidOffset = std::pair<std::uint64_t, std::uint64_t>;
    std::unordered_map<ReqidOffset, Packet> sentPackets_;
    std::unordered_multimap<std::uint64_t, std::uint64_t> unackedPerReq_;
    std::size_t delivered_ = 0;
    Clock::time_point deliveredTime_;
    Clock::time_point firstSentTime_;
    std::size_t appLimited_ = 0;

    // State of the current delivery rate sample.
    struct RateSample {
        double deliveryRate = 0.0;
        std::size_t isAppLimited = 0;
        folly::Optional<std::chrono::milliseconds> interval{
            std::chrono::milliseconds{0}};
        std::size_t delivered = 0;
        std::size_t priorDelivered = 0;
        Clock::time_point priorTime;
        std::chrono::milliseconds sendElapsed{0};
        std::chrono::milliseconds ackElapsed{0};
    } rs_;

    /**
     * https://tools.ietf.org/html/draft-cardwell-iccrg-bbr-congestion-control-00#section-5
     *
     * Number of bytes sent to the receiver but not yet acknowledged.
     */
    std::size_t inFlight_{0};  // aka pipe
    // NOTE(review): presumably guards inFlight_ - confirm in the
    // implementation.
    mutable std::mutex inFlightMutex_;
    std::size_t pendingReqsSize_ = 0;  // aka writeSeq - SND.NXT

    /**
     * The transport sender's congestion window, which limits the amount of data
     * in flight.
     */
    std::size_t cwnd_ = BBRMinPipeCwnd;
    /**
     * BBR's estimated bottleneck bandwidth available to the transport flow,
     * estimated from the maximum delivery rate sample in a sliding window.
     */
    double btlBw_ = 0.0;
    /**
     * The max filter used to estimate btlBw_.
     */
    MinMaxWindow<double, std::greater_equal<>, std::size_t, std::size_t>
        btlBwFilter_{BtlBWFilterLen};
    /**
     * BBR's estimated two-way round-trip propagation delay of the path,
     * estimated from the windowed minimum recent round-trip delay sample.
     */
    folly::Optional<std::chrono::microseconds> rtProp_;
    /**
     * A boolean recording whether the BBR.RTprop has expired and is due for a
     * refresh with an application idle period or a transition into ProbeRTT
     * state.
     */
    bool rtPropExpired_ = false;
    /**
     * The time (as measured by Clock, i.e. std::chrono::steady_clock) at which
     * the current rtProp_ sample was obtained.
     */
    Clock::time_point rtPropStamp_;
    folly::Optional<Clock::time_point> probeRttDoneStamp_;
    bool probeRttRoundDone_ = false;
    std::size_t priorCwnd_ = 0;
    bool idleRestart_ = false;

    State state_ = State::startup;
    /**
     * The dynamic gain factor used to scale btlBw_ to produce pacingRate_.
     */
    double pacingGain_ = BBRHighGain;
    /**
     * The dynamic gain factor used to scale the estimated BDP to produce a
     * congestion window (cwnd).
     */
    double cwndGain_ = BBRHighGain;

    /**
     * packet.delivered value denoting the end of a packet-timed round trip.
     */
    std::size_t nextRoundDelivered_ = 0;
    /**
     * A boolean that BBR sets to true once per packet-timed round trip, on
     * ACKs that advance roundCount_.
     */
    bool roundStart_ = false;
    /**
     *  Count of packet-timed round trips.
     */
    std::size_t roundCount_ = 0;

    /**
     * A boolean that records whether BBR estimates that it has ever fully
     * utilized its available bandwidth ("filled the pipe").
     */
    bool filledPipe_ = false;
    double fullBw_ = 0;
    std::size_t fullBwCount_ = 0;

    /**
     * The current pacing rate for a BBR flow, which controls inter-packet
     * spacing.
     *
     * Zero-initialised so that it never holds an indeterminate value before
     * the first pacing rate update.
     */
    double pacingRate_ = 0.0;

    std::size_t sendQuantum_{1500};
    std::size_t targetCwnd_{0};
    Clock::time_point cycleStamp_;
    std::size_t cycleIndex_{0};
    std::size_t priorInFlight_{0};

    bool isUpdateScheduled_{false};
    Clock::time_point nextSendTime_;
    Clock::time_point pseudoNextSendTime_;
    ShaperTimer<Clock> &timer_;
    // Running counters.
    std::size_t readRequestsCount_{0};
    std::size_t scheduledPacketsCount_{0};
    std::size_t sentPacketsCount_{0};
    std::size_t acknowledgedPacketsCount_{0};
};

// Shared-ownership handle to a Shaper.
using ShaperPtr = std::shared_ptr<Shaper>;

}  // namespace rtransfer
