// picobench v2.00
// https://github.com/iboB/picobench
//
// A micro microbenchmarking library in a single header file
//
// MIT License
//
// Copyright(c) 2017-2018 Borislav Stanimirov
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files(the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions :
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//
//
//                  VERSION HISTORY
//
//  2.x1 (2021-01-19) * Added user data per iter, changed text output.
//  2.01 (2019-03-03) * Fixed android build when binding to a signle core
//  2.00 (2018-10-30) * Breaking change! runner::run_benchmarks doesn't return
//                      a report anymore. The report is generated by
//                      runner::generate_report instead
//                    * Breaking change! report_output_format doesn't accept
//                      output streams as arguments. Use set_output_streams.
//                    * Potentially breaking change (gcc and clang)! Always set
//                      thread affinity to first core. Macro to turn this off.
//                    * Added runner::run which performs a full execution
//                    * Added benchmark results and results comparison
//                    * Added error enum
//                    * Macro option to allow a std::function as a benchmark
//                    * Macros for default iterations and samples
//                    * Allowing local registration of benchmarks in a runner
//                    * Added local_runner which doesn't consume registry
//                    * More force-inline functions in states
//                    * Fixed some potential compilation warnings
//                    * Removed tests from header
//                    * Anonymous namespace for impl-only classes and funcs
//                    * Added setters and getters for every config option
//  1.05 (2018-07-17) * Counting iterations of state
//                    * Optionally set thread affinity when running benchmarks
//                      so as not to miss cpu cycles with the high res clock
//  1.04 (2018-02-06) * User data for benchmarks, which can be seen from states
//                    * `add_custom_duration` to states so the user can modify time
//                    * Text table format fixes
//                    * Custom cmd opts in runner
//                    * --version CLI command
//  1.03 (2018-01-05) Added helper methods for easier browsing of reports
//  1.02 (2018-01-04) Added parsing of command line
//  1.01 (2018-01-03) * Only taking the fastest sample into account
//                    * Set default number of samples to 2
//                    * Added CSV output
//  1.00 (2018-01-01) Initial release
//  0.01 (2017-12-28) Initial prototype release
//
//
//                  EXAMPLE
//
// void my_function(); // the function you want to benchmark
//
// // write your benchmarking code in a function like this
// static void benchmark_my_function(picobench::state& state)
// {
//     // use the state in a range-based for loop to call your code
//     for (auto _ : state)
//         my_function();
// }
// // create a picobench with your benchmarking code
// PICOBENCH(benchmark_my_function);
//
//
//                  BASIC DOCUMENTATION
//
// A very brief usage guide follows. For more detailed documentation see the
// README here: https://github.com/iboB/picobench/blob/master/README.md
//
// Simply include this file wherever you need.
// You need to define PICOBENCH_IMPLEMENT_WITH_MAIN (or PICOBENCH_IMPLEMENT if
// you want to write your own main function) in one compilation unit to have
// the implementation compiled there.
//
// The benchmark code must be a `void (picobench::state&)` function which
// you have written. Benchmarks are registered using the `PICOBENCH` macro
// where the only argument is the function's name.
//
// You can have multiple benchmarks in multiple files. All will be run when the
// executable starts.
//
// Typically a benchmark has a loop. To run the loop use the state argument in
// a range-based for loop in your function. The time spent looping is measured
// for the benchmark. You can have initialization/deinitialization code outside
// of the loop and it won't be measured.
//
#pragma once

#include <cstdint>
#include <chrono>
#include <vector>

#if defined(PICOBENCH_STD_FUNCTION_BENCHMARKS)
#   include <functional>
#endif

#define PICOBENCH_VERSION 2.x1
#define PICOBENCH_VERSION_STR "2.x1"

#if defined(PICOBENCH_DEBUG)
#   include <cassert>
#   define I_PICOBENCH_ASSERT assert
#else
#   define I_PICOBENCH_ASSERT(...)
#endif

#if defined(__GNUC__)
#   define PICOBENCH_INLINE __attribute__((always_inline))
#elif defined(_MSC_VER)
#   define PICOBENCH_INLINE  __forceinline
#else
#   define PICOBENCH_INLINE  inline
#endif

namespace picobench
{

#if defined(_MSC_VER) || defined(__MINGW32__) || defined(PICOBENCH_TEST)
struct high_res_clock
{
    typedef long long rep;
    typedef std::nano period;
    typedef std::chrono::duration<rep, period> duration;
    typedef std::chrono::time_point<high_res_clock> time_point;
    static const bool is_steady = true;

    static time_point now();
};
#else
using high_res_clock = std::chrono::high_resolution_clock;
#endif

using result_t = intptr_t;
using udata_t = uintptr_t;

class state
{
public:
    explicit state(size_t num_iterations, udata_t user_data = 0, udata_t arg = 0)
        : _user_data(user_data)
        , _arg(arg)
        , _iterations(num_iterations)
    {
        I_PICOBENCH_ASSERT(_iterations > 0);
    }

    size_t iterations() const { return _iterations; }

    uint64_t duration_ns() const { return _duration_ns; }
    void add_custom_duration(uint64_t duration_ns) { _duration_ns += duration_ns; }

    udata_t user_data() const { return _user_data; }
    udata_t arg() const { return _arg; }

    // optionally set result of benchmark
    // this can be used as a value sync to prevent optimizations
    // or a way to check whether benchmarks produce the same results
    void set_result(uintptr_t data) { _result = data; }
    result_t result() const { return _result; }

    PICOBENCH_INLINE
    void start_timer()
    {
        _start = high_res_clock::now();
    }

    PICOBENCH_INLINE
    void stop_timer()
    {
        auto duration = high_res_clock::now() - _start;
        _duration_ns = std::chrono::duration_cast<std::chrono::nanoseconds>(duration).count();
    }

    struct iterator
    {
        PICOBENCH_INLINE
        iterator(state* parent)
            : _counter(0)
            , _lim(parent->iterations())
            , _state(parent)
        {
            I_PICOBENCH_ASSERT(_counter < _lim);
        }

        PICOBENCH_INLINE
        iterator()
            : _counter(0)
            , _lim(0)
            , _state(nullptr)
        {}

        PICOBENCH_INLINE
        iterator& operator++()
        {
            I_PICOBENCH_ASSERT(_counter < _lim);
            ++_counter;
            return *this;
        }

        PICOBENCH_INLINE
        bool operator!=(const iterator&) const
        {
            if (_counter < _lim) return true;
            _state->stop_timer();
            return false;
        }

        PICOBENCH_INLINE
        size_t operator*() const
        {
            return _counter;
        }

    private:
        size_t _counter;
        const size_t _lim;
        state* _state;
    };

    PICOBENCH_INLINE
    iterator begin()
    {
        start_timer();
        return iterator(this);
    }

    PICOBENCH_INLINE
    iterator end()
    {
        return iterator();
    }

private:
    high_res_clock::time_point _start;
    uint64_t _duration_ns = 0;
    udata_t _user_data;
    udata_t _arg;
    size_t _iterations;
    result_t _result = 0;
};

// this can be used for manual measurement
class scope
{
public:
    PICOBENCH_INLINE
    scope(state& s)
        : _state(s)
    {
        _state.start_timer();
    }

    PICOBENCH_INLINE
    ~scope()
    {
        _state.stop_timer();
    }
private:
    state& _state;
};

#if defined(PICOBENCH_STD_FUNCTION_BENCHMARKS)
using benchmark_proc = std::function<void(state&)>;
#else
using benchmark_proc = void(*)(state&);
#endif

class benchmark
{
public:
    const char* name() const { return _name; }

    benchmark& iterations(std::vector<size_t> data) { _state_iterations = std::move(data); return *this; }
    benchmark& samples(int n) { _samples = n; return *this; }
    benchmark& label(const char* label) { _name = label; return *this; }
    benchmark& baseline(bool b = true) { _baseline = b; return *this; }
    benchmark& user_data(udata_t data) { _user_data = data; return *this; }
    benchmark& args(std::vector<udata_t> data) { _args = std::move(data); return *this; }

protected:
    friend class runner;

    benchmark(const char* name, benchmark_proc proc);

    const char* _name;
    const benchmark_proc _proc;
    bool _baseline = false;

    udata_t _user_data = 0;
    std::vector<size_t> _state_iterations;
    std::vector<udata_t> _args;
    int _samples = 0;
};

// used for globally  functions
// note that you can instantiate a runner and register local benchmarks for it alone
class global_registry
{
public:
    static int set_bench_suite(const char* name);
    static benchmark& new_benchmark(const char* name, benchmark_proc proc);
};

}

#define I_PICOBENCH_PP_CAT(a, b) I_PICOBENCH_PP_INTERNAL_CAT(a, b)
#define I_PICOBENCH_PP_INTERNAL_CAT(a, b) a##b

#define PICOBENCH_SUITE(name) \
    static int I_PICOBENCH_PP_CAT(picobench_suite, __LINE__) = \
    picobench::global_registry::set_bench_suite(name)

#define PICOBENCH(func) \
    static auto& I_PICOBENCH_PP_CAT(picobench, __LINE__) = \
    picobench::global_registry::new_benchmark(#func, func)

#if defined(PICOBENCH_IMPLEMENT_WITH_MAIN)
#   define PICOBENCH_IMPLEMENT
#   define PICOBENCH_IMPLEMENT_MAIN
#endif

#if defined(PICOBENCH_IMPLEMENT)

#include <random>
#include <iostream>
#include <fstream>
#include <iomanip>
#include <map>
#include <memory>
#include <cstring>
#include <cstdlib>

#if defined(_WIN32)
#   define WIN32_LEAN_AND_MEAN
#   include <Windows.h>
#else
#   if !defined(PICOBENCH_DONT_BIND_TO_ONE_CORE)
#       if defined(__APPLE__)
#           include <mach/mach.h>
#       else
#           include <sched.h>
#       endif
#   endif
#endif

namespace picobench
{

// namespace
// {

enum error_t
{
    no_error,
    error_bad_cmd_line_argument, // ill-formed command-line argument
    error_unknown_cmd_line_argument, // command argument looks like a picobench one, but isn't
    error_sample_compare, // benchmark produced different results across samples
    error_benchmark_compare, // two benchmarks of the same suite and dimension produced different results
};

class report
{
public:
    struct benchmark_problem_space
    {
        size_t dimension; // number of iterations for the problem space
        udata_t user_data;
        udata_t arg;
        int samples; // number of samples taken
        uint64_t total_time_ns; // fastest sample!!!
        result_t result; // result of fastest sample
    };
    struct benchmark
    {
        const char* name;
        bool is_baseline;
        std::vector<benchmark_problem_space> data;
    };

    struct suite
    {
        const char* name;
        std::vector<benchmark> benchmarks; // benchmark view

        const benchmark* find_benchmark(const char* name) const
        {
            for (auto& b : benchmarks)
            {
                if (strcmp(b.name, name) == 0)
                    return &b;
            }

            return nullptr;
        }

        const benchmark* find_baseline() const
        {
            for (auto& b : benchmarks)
            {
                if (b.is_baseline)
                    return &b;
            }

            return nullptr;
        }
    };

    std::vector<suite> suites;
    error_t error = no_error;

    const suite* find_suite(const char* name) const
    {
        for (auto& s : suites)
        {
            if (strcmp(s.name, name) == 0)
                return &s;
        }

        return nullptr;
    }

    void to_text(std::ostream& out) const
    {
        using namespace std;
        int width = 100;
        for (auto& suite : suites)
        {
            if (suite.name)
            {
                out << suite.name << ":\n";
            }
            line(out, width);
            out <<
                "  Name (* = baseline)        |Iterations |       Arg |Baseline |     ns/op |  Total ms |  Ops/second\n";
            line(out, width);

            auto problem_space_view = get_problem_space_view(suite);
            for (auto& ps : problem_space_view)
            {
                const problem_space_benchmark* baseline = nullptr;
                for (auto& bm : ps.second)
                {
                    if (bm.is_baseline)
                    {
                        baseline = &bm;
                        break;
                    }
                }

                for (auto& bm : ps.second)
                {
                    out << (bm.is_baseline ? "* " : "  ") << left << setw(26) << bm.name << right;

                    out << " |"
                        << setw(10) << ps.first.first << " |"
                        << setw(10) << bm.arg << " |";

                    if (bm.is_baseline) //(baseline == &bm)
                    {
                        baseline = &bm;
                        out << "       - |";
                    }
                    else if (baseline)
                    {
                        out << setw(8) << fixed << setprecision(3)
                            << double(bm.total_time_ns) / double(baseline->total_time_ns) << " |";
                    }
                    else
                    {
                        // no baseline to compare to
                        out << "       ? |";
                    }

                    auto ns_op = (bm.total_time_ns / ps.first.first);
                    if (ns_op > 99999999)
                    {
                        int e = 0;
                        while (ns_op > 999999)
                        {
                            ++e;
                            ns_op /= 10;
                        }
                        out << setw(8) << ns_op << 'e' << e;
                    }
                    else
                    {
                        out << setw(10) << ns_op;
                    }
                    out << " |";
                    out << setw(10) << fixed << setprecision(2) << double(bm.total_time_ns) / 1000000.0 << " |";

                    auto ops_per_sec = ps.first.first * (1000000000.0 / double(bm.total_time_ns));
                    out << setw(12) << fixed << setprecision(1) << ops_per_sec << "\n";
                }
            }
            line(out, width);
        }
    }

    void to_text_concise(std::ostream& out)
    {
        using namespace std;
        int width = 65;
        for (auto& suite : suites)
        {
            if (suite.name)
            {
                out << suite.name << ":\n";
            }

            line(out, width);

            out <<
                "  Name (* = baseline)        | Baseline |     ns/op |    Total ms\n"; // |  Ops/second\n";

            line(out, width);

            const benchmark* baseline = nullptr;
            for (auto& bm : suite.benchmarks)
            {
                if (bm.is_baseline)
                {
                    baseline = &bm;
                    break;
                }
            }
            I_PICOBENCH_ASSERT(baseline);
            uint64_t baseline_total_time = 0;
            size_t baseline_total_iterations = 0;
            for (auto& d : baseline->data)
            {
                baseline_total_time += d.total_time_ns;
                baseline_total_iterations += d.dimension;
            }

            for (auto& bm : suite.benchmarks)
            {
                out << (bm.is_baseline ? "* " : "  ") << left << setw(26) << bm.name << right
                    << " |";

                uint64_t total_time = 0;
                size_t total_iterations = 0;
                for (auto& d : bm.data)
                {
                    total_time += d.total_time_ns;
                    total_iterations += d.dimension;
                }
                uint64_t ns_per_op = total_time / total_iterations;

                if (bm.is_baseline)
                {
                    out << "        -";
                    baseline = &bm;
                    baseline_total_time = total_time;
                    baseline_total_iterations = total_iterations;
                }
                else
                {
                    out << setw(9) << fixed << setprecision(3)
                        << double(total_time) / baseline_total_time;
                }

                out << " |" << setw(10) << ns_per_op << " |";
                out << setw(12) << fixed << setprecision(2) << double(total_time) / 1000000.0 << "\n";

                //auto ops_per_sec = total_iterations * (1000000000.0 / total_time);
                //out << setw(12) << fixed << setprecision(1) << ops_per_sec << "\n";
            }

            line(out, width);
        }
    }

    void to_csv(std::ostream& out) const
    {
        using namespace std;
        const char* sep = ",";

        for (auto& suite : suites)
        {
            out << "Suite, Baseline, Benchmark, Iterations, Arg, Ratio, Total ms, ns/op, Ops/second\n";

            auto problem_space_view = get_problem_space_view(suite);
            for (auto& ps : problem_space_view)
            {
                const problem_space_benchmark* baseline = nullptr;
                for (auto& bm : ps.second)
                {
                    if (bm.is_baseline)
                    {
                        baseline = &bm;
                        break;
                    }
                }

                for (auto& bm : ps.second)
                {
                    out << '"' << (suite.name ? suite.name : "") << '"';
                    out << sep << (bm.is_baseline ? "true" : "false");
                    out << sep << '"' << bm.name << '"';
                    out << sep << ps.first.first
                        << sep << bm.arg << sep;

                    if (baseline == &bm)
                    {
                        out << 1.0;
                    }
                    else if (baseline)
                    {
                        out << fixed << setprecision(3) << double(bm.total_time_ns) / baseline->total_time_ns;
                    }
                    else
                    {
                        out << -1.0; // no baseline to compare to
                    }

                    out << sep << fixed << setprecision(3) << bm.total_time_ns / 1000000.0;

                    auto ns_op = (bm.total_time_ns / ps.first.first);
                    out << sep << ns_op;

                    auto ops_per_sec = ps.first.first * (1000000000.0 / bm.total_time_ns);
                    out << sep << fixed << setprecision(1) << ops_per_sec << "\n";
                }
            }
        }
    }


    struct problem_space_benchmark
    {
        const char* name;
        bool is_baseline;
        udata_t user_data;
        udata_t arg;
        uint64_t total_time_ns; // fastest sample!!!
        result_t result; // result of fastest sample
    };
    using problem_space_view_map = std::map<std::pair<size_t, udata_t>,
                                            std::vector<problem_space_benchmark>>;
    static problem_space_view_map get_problem_space_view(const suite& s)
    {
        problem_space_view_map res;
        for (auto& bm : s.benchmarks)
        {
            for (auto& d : bm.data)
            {
                auto& pvbs = res[{d.dimension, d.arg}];
                pvbs.push_back({ bm.name, bm.is_baseline, d.user_data, d.arg, d.total_time_ns, d.result });
            }
        }
        return res;
    }

private:

    static void line(std::ostream& out, int width = 79)
    {
        for (int i = 0; i < width; ++i) out.put('=');
        out.put('\n');
    }
};

class benchmark_impl : public benchmark
{
public:
    benchmark_impl(const char* name, benchmark_proc proc)
        : benchmark(name, proc)
    {}

private:
    friend class runner;

    // state
    std::vector<state> _states; // length is _samples * _state_iterations.size()
    std::vector<state>::iterator _istate;
};

class picostring
{
public:
    picostring() = default;
    explicit picostring(const char* text)
    {
        str = text;
        len = int(strlen(text));
    }

    const char* str;
    int len = 0;

    // checks whether other begins with this string
    bool cmp(const char* other) const
    {
        return strncmp(str, other, size_t(len)) == 0;
    }
};

class null_streambuf : public std::streambuf
{
public:
    virtual int overflow(int c) override { return c; }
};

struct null_stream : public std::ostream
{
    null_stream() : std::ostream(&_buf) {}
private:
    null_streambuf _buf;
} cnull;

enum class report_output_format
{
    text,
    concise_text,
    csv,
    all,
};

#if !defined(PICOBENCH_DEFAULT_ITERATIONS)
#   define PICOBENCH_DEFAULT_ITERATIONS { 8, 64, 512, 4096, 8192 }
#endif

#if !defined(PICOBENCH_DEFAULT_SAMPLES)
#   define PICOBENCH_DEFAULT_SAMPLES 2
#endif

using benchmarks_vector = std::vector<std::unique_ptr<benchmark_impl>>;
struct rsuite
{
    const char* name;
    benchmarks_vector benchmarks;
};

class registry
{
public:
    benchmark& add_benchmark(const char* name, benchmark_proc proc)
    {
        auto b = new benchmark_impl(name, proc);
        benchmarks_for_current_suite().emplace_back(b);
        return *b;
    }

    void set_suite(const char* name)
    {
        _current_suite_name = name;
    }

    const char*& current_suite_name()
    {
        return _current_suite_name;
    }

    benchmarks_vector& benchmarks_for_current_suite()
    {
        for (auto& s : _suites)
        {
            if (s.name == _current_suite_name)
                return s.benchmarks;

            if (s.name && _current_suite_name && strcmp(s.name, _current_suite_name) == 0)
                return s.benchmarks;
        }
        _suites.push_back({ _current_suite_name, {} });
        return _suites.back().benchmarks;
    }

protected:
    friend class runner;
    const char* _current_suite_name = nullptr;
    std::vector<rsuite> _suites;
};

registry& g_registry()
{
    static registry r;
    return r;
}

class runner : public registry
{
public:
    runner(bool local = false)
        : _default_state_iterations(PICOBENCH_DEFAULT_ITERATIONS)
        , _default_samples(PICOBENCH_DEFAULT_SAMPLES)
    {
        if (!local)
        {
            _suites = std::move(g_registry()._suites);
        }
    }

    int run(int benchmark_random_seed = -1)
    {
        if (should_run())
        {
            run_benchmarks(benchmark_random_seed);
            auto report = generate_report();
            std::ostream* out = _stdout;
            std::ofstream fout;
            report_output_format fmt[] = {report_output_format::csv,
                                          report_output_format::text,
                                          report_output_format::concise_text};
            const char *ext[] = {".csv", ".txt", ".lst"}, *fn = preferred_output_filename();
            bool all = preferred_output_format() == report_output_format::all;
            for (int i = 0; i < 3; ++i)
            {
                if (all || preferred_output_format() == fmt[i])
                {
                    if (fn)
                    {
                        std::string name(fn);

                        if (all || name.find(".") == std::string::npos)
                        {
                             name += ext[i];
                        }
                        fout.close();
                        fout.open(name.c_str());
                        if (!fout.is_open())
                        {
                            std::cerr << "Error: Could not open output file `" << fn << "`\n";
                            return 1;
                        }
                        out = &fout;
                    }

                    switch (fmt[i])
                    {
                    case report_output_format::text:
                        report.to_text(*out);
                        break;
                    case report_output_format::concise_text:
                        report.to_text_concise(*out);
                        break;
                    case report_output_format::csv:
                        report.to_csv(*out);
                        break;
                    default: break;
                    }
                }
            }
        }
        return error();
    }

    void run_benchmarks(int random_seed = -1)
    {
        I_PICOBENCH_ASSERT(_error == no_error && _should_run);

        if (random_seed == -1)
        {
            random_seed = int(std::random_device()());
        }

        std::minstd_rand rnd(random_seed);

        // vector of all benchmarks
        std::vector<benchmark_impl*> benchmarks;
        for (auto& suite : _suites)
        {
            // also identify a baseline in this loop
            // if there is no explicit one, set the first one as a baseline
            bool found_baseline = false;
            for (auto irb = suite.benchmarks.begin(); irb != suite.benchmarks.end(); ++irb)
            {
                auto& rb = *irb;
                rb->_states.clear(); // clear states so we can safely call run_benchmarks multiple times
                benchmarks.push_back(rb.get());
                if (rb->_baseline)
                {
                    found_baseline = true;
                }

#if !defined(PICOBENCH_STD_FUNCTION_BENCHMARKS)
                // check for same func
                for (auto ib = irb+1; ib != suite.benchmarks.end(); ++ib)
                {
                    auto& b = *ib;
                    if (rb->_proc == b->_proc)
                    {
                        *_stdwarn << "Warning: " << rb->name() << " and " << b->name()
                                 << " are benchmarks of the same function.\n";
                    }
                }
#endif
            }

            if (!found_baseline && !suite.benchmarks.empty())
            {
                suite.benchmarks.front()->_baseline = true;
            }
        }

        // initialize benchmarks
        for (auto b : benchmarks)
        {
            if (b->_state_iterations.empty())
                b->_state_iterations = _default_state_iterations;

            udata_t arg = b->_args.empty() ? udata_t() : b->_args.back();
            b->_args.resize(b->_state_iterations.size(), arg);

            if (b->_samples == 0)
                b->_samples = _default_samples;

            b->_states.reserve(b->_state_iterations.size() * b->_samples);

            // fill states while random shuffling them
            for (size_t iter = 0; iter < b->_state_iterations.size(); ++iter)
            {
                for (int i = 0; i < b->_samples; ++i)
                {
                    auto index = rnd() % (b->_states.size() + 1);
                    auto pos = b->_states.begin() + long(index);
                    b->_states.emplace(pos, b->_state_iterations[iter], b->_user_data, b->_args[iter]);
                }
            }

            b->_istate = b->_states.begin();
        }

#if !defined(PICOBENCH_DONT_BIND_TO_ONE_CORE)
        // set thread affinity to first cpu
        // so the high resolution clock doesn't miss cycles
        {
#if defined(_WIN32)
        SetThreadAffinityMask(GetCurrentThread(), 1);
#elif defined(__APPLE__)
        thread_affinity_policy_data_t policy = {0};
        thread_policy_set(
            pthread_mach_thread_np(pthread_self()),
            THREAD_AFFINITY_POLICY,
            (thread_policy_t)&policy, 1);
#else
        cpu_set_t cpuset;
        CPU_ZERO(&cpuset);
        CPU_SET(0, &cpuset);

        sched_setaffinity(0, sizeof(cpu_set_t), &cpuset);
#endif
        }
#endif

        // we run a random benchmark from it incrementing _istate for each
        // when _istate reaches _states.end(), we erase the benchmark
        // when the vector becomes empty, we're done
        while (!benchmarks.empty())
        {
            auto i = benchmarks.begin() + long(rnd() % benchmarks.size());
            auto& b = *i;
            std::cerr << '.';
            b->_proc(*b->_istate);

            ++b->_istate;

            if (b->_istate == b->_states.end())
            {
                benchmarks.erase(i);
            }
        }
        std::cerr << '\n';
    }

    // function to compare results
    template <typename CompareResult = std::equal_to<result_t>>
    report generate_report(CompareResult cmp = std::equal_to<result_t>()) const
    {
        report rpt;

        rpt.suites.resize(_suites.size());
        auto rpt_suite = rpt.suites.begin();

        for (auto& suite : _suites)
        {
            rpt_suite->name = suite.name;

            // build benchmark view
            rpt_suite->benchmarks.resize(suite.benchmarks.size());
            auto rpt_benchmark = rpt_suite->benchmarks.begin();

            for (auto& b : suite.benchmarks)
            {
                rpt_benchmark->name = b->_name;
                rpt_benchmark->is_baseline = b->_baseline;

                rpt_benchmark->data.reserve(b->_state_iterations.size());
                for (size_t i = 0; i < b->_state_iterations.size(); ++i)
                {
                    rpt_benchmark->data.push_back({ b->_state_iterations[i], b->_user_data, b->_args[i], 0, 0ll });
                }

                for (auto& state : b->_states)
                {
                    for (auto& d : rpt_benchmark->data)
                    {
                        if (state.iterations() == d.dimension && state.arg() == d.arg)
                        {
                            if (d.total_time_ns == 0 || d.total_time_ns > state.duration_ns())
                            {
                                d.total_time_ns = state.duration_ns();
                                d.result = state.result();
                            }

                            if (_compare_results_across_samples)
                            {
                                if (d.result != state.result() && !cmp(d.result, state.result()))
                                {
                                    *_stderr << "Error: Two samples of " << b->name() << " @" << d.dimension << " produced different results: "
                                             << d.result << " and " << state.result() << '\n';
                                    _error = error_sample_compare;
                                }
                            }

                            ++d.samples;
                        }
                    }
                }

#if defined(PICOBENCH_DEBUG)
                for (auto& d : rpt_benchmark->data)
                {
                    I_PICOBENCH_ASSERT(d.samples == b->_samples);
                }
#endif

                ++rpt_benchmark;
            }

            ++rpt_suite;
        }

        if (_compare_results_across_benchmarks)
        {
            for(auto& suite : rpt.suites)
            {
                auto psview = report::get_problem_space_view(suite);

                for (auto& space : psview)
                {
                    I_PICOBENCH_ASSERT(!space.second.empty());

                    if (space.second.size() == 1)
                    {
                        auto& b = space.second.front();
                        *_stdwarn << "Warning: Benchmark " << b.name << " @" << space.first.first
                                  << " has a single instance and cannot be compared to others.\n";
                        continue;
                    }

                    auto result0 = space.second.front().result;

                    for (auto& b : space.second)
                    {
                        if (result0 != b.result && !cmp(result0, b.result))
                        {
                            auto& f = space.second.front();
                            *_stderr << "Error: Benchmarks " << f.name << " and " << b.name
                                     << " @" << space.first.first << " produce different results: "
                                     << result0 << " and " << b.result << '\n';
                            _error = error_benchmark_compare;
                        }
                    }
                }
            }
        }

        return rpt;
    }

    void set_default_state_iterations(const std::vector<size_t>& data)
    {
        _default_state_iterations = data;
    }

    const std::vector<size_t>& default_state_iterations() const
    {
        return _default_state_iterations;
    }

    void set_default_samples(int n)
    {
        _default_samples = n;
    }

    int default_samples() const
    {
        return _default_samples;
    }

    void add_cmd_opt(const char* cmd, const char* arg_desc, const char* cmd_desc, bool(*handler)(uintptr_t, const char*), udata_t user_data = 0)
    {
        cmd_line_option opt;
        opt.cmd = picostring(cmd);
        opt.arg_desc = picostring(arg_desc);
        opt.desc = cmd_desc;
        opt.handler = nullptr;
        opt.user_data = user_data;
        opt.user_handler = handler;
        _opts.push_back(opt);
    }

    // returns false if there were errors parsing the command line
    // all args starting with prefix are parsed
    // the others are ignored
    bool parse_cmd_line(int argc, const char* const argv[], const char* cmd_prefix = "-")
    {
        _cmd_prefix = picostring(cmd_prefix);

        if (!_has_opts)
        {
            _opts.emplace_back("-iters=", "<n1,n2,n3,...>",
                "Sets default iterations for benchmarks",
                &runner::cmd_iters);
            _opts.emplace_back("-samples=", "<n>",
                "Sets default number of samples for benchmarks",
                &runner::cmd_samples);
            _opts.emplace_back("-out-fmt=", "<txt|con|csv>",
                "Outputs text, concise, csv or all",
                &runner::cmd_out_fmt);
            _opts.emplace_back("-all", "",
                "Outputs all formats: text, con, csv",
                &runner::cmd_out_all);
            _opts.emplace_back("-output=", "<filename>",
                "Sets output filename or `stdout`",
                &runner::cmd_output);
            _opts.emplace_back("-no-compare-results", "",
                "Doesn't compare benchmark results",
                &runner::cmd_compare_results);
            _opts.emplace_back("-no-run", "",
                "Doesn't run benchmarks",
                &runner::cmd_no_run);
            _opts.emplace_back("-version", "",
                "Show version info",
                &runner::cmd_version);
            _opts.emplace_back("-help", "",
                "Prints help",
                &runner::cmd_help);
            _has_opts = true;
        }

        for (int i = 1; i < argc; ++i)
        {
            if (!_cmd_prefix.cmp(argv[i]))
                continue;

            auto arg = argv[i] + _cmd_prefix.len;

            bool found = false;
            for (auto& opt : _opts)
            {
                if (opt.cmd.cmp(arg))
                {
                    found = true;
                    bool success = false;
                    if (opt.handler)
                    {
                        success = (this->*opt.handler)(arg + opt.cmd.len);
                    }
                    else
                    {
                        I_PICOBENCH_ASSERT(opt.user_handler);
                        success = opt.user_handler(opt.user_data, arg + opt.cmd.len);
                    }

                    if (!success)
                    {
                        *_stderr << "Error: Bad command-line argument: " << argv[i] << "\n";
                        _error = error_bad_cmd_line_argument;
                        return false;
                    }
                    break;
                }
            }

            if (!found)
            {
                *_stderr << "Error: Unknown command-line argument: " << argv[i] << "\n";
                _error = error_unknown_cmd_line_argument;
                return false;
            }
        }

        return true;
    }

    void set_should_run(bool set) { _should_run = set; }
    bool should_run() const { return _error == no_error && _should_run; }
    void set_error(error_t e) { _error = e; }
    error_t error() const { return _error; }

    void set_output_streams(std::ostream& out, std::ostream& err)
    {
        _stdout = &out;
        _stderr = &err;
        _stdwarn = &out;
    }

    void set_preferred_output_format(report_output_format fmt) { _output_format = fmt; }
    report_output_format preferred_output_format() const { return _output_format; }

    // can be nullptr (run will interpret it as stdout)
    void set_preferred_output_filename(const char* path) { _output_file = path; }
    const char* preferred_output_filename() const { return _output_file; }

    void set_compare_results_across_samples(bool b) { _compare_results_across_samples = b; }
    bool compare_results_across_samples() const { return _compare_results_across_samples; }

    void set_compare_results_across_benchmarks(bool b) { _compare_results_across_benchmarks = b; }
    bool compare_results_across_benchmarks() const { return _compare_results_across_benchmarks; }

private:
    // runner's suites and benchmarks come from its parent: registry

    // state and configuration
    mutable error_t _error = no_error;
    bool _should_run = true;

    bool _compare_results_across_samples = true;
    bool _compare_results_across_benchmarks = true;

    report_output_format _output_format = report_output_format::concise_text;
    const char* _output_file = nullptr; // nullptr means stdout

    std::ostream* _stdout = &std::cout;
    std::ostream* _stderr = &std::cerr;
    std::ostream* _stdwarn = &std::cout;

    // default data

    // default iterations per state per benchmark
    std::vector<size_t> _default_state_iterations;

    // default samples per benchmark
    int _default_samples;

    // command line parsing
    picostring _cmd_prefix;
    typedef bool (runner::*cmd_handler)(const char*); // internal handler
    typedef bool(*ext_handler)(udata_t user_data, const char* cmd_line); // external (user) handler
    struct cmd_line_option
    {
        cmd_line_option() = default;
        cmd_line_option(const char* c, const char* a, const char* d, cmd_handler h)
            : cmd(c)
            , arg_desc(a)
            , desc(d)
            , handler(h)
            , user_data(0)
            , user_handler(nullptr)
        {}
        picostring cmd;
        picostring arg_desc;
        const char* desc;
        cmd_handler handler; // may be nullptr for external handlers
        udata_t user_data; // passed as an argument to user handlers
        ext_handler user_handler;
    };
    bool _has_opts = false; // have opts been added to list
    std::vector<cmd_line_option> _opts;

    bool cmd_iters(const char* line)
    {
        std::vector<size_t> iters;
        auto p = line;
        while (true)
        {
            auto i = strtoull(p, nullptr, 10);
            if (i <= 0) return false;
            iters.push_back(i);
            p = strchr(p + 1, ',');
            if (!p) break;
            ++p;
        }
        if (iters.empty()) return false;
        _default_state_iterations = iters;
        return true;
    }

    bool cmd_samples(const char* line)
    {
        int samples = int(strtol(line, nullptr, 10));
        if (samples <= 0) return false;
        _default_samples = samples;
        return true;
    }

    bool cmd_no_run(const char* line)
    {
        if (*line) return false;
        _should_run = false;
        return true;
    }

    bool cmd_version(const char* line)
    {
        if (*line) return false;
        *_stdout << "picobench " PICOBENCH_VERSION_STR << "\n";
        _should_run = false;
        return true;
    }

    bool cmd_help(const char* line)
    {
        if (*line) return false;
        cmd_version(line);
        auto& cout = *_stdout;
        for (auto& opt : _opts)
        {
            cout << ' ' << _cmd_prefix.str << opt.cmd.str << opt.arg_desc.str;
            int w = 27 - (_cmd_prefix.len + opt.cmd.len + opt.arg_desc.len);
            for (int i = 0; i < w; ++i)
            {
                cout.put(' ');
            }
            cout << opt.desc << "\n";
        }
        _should_run = false;
        return true;
    }

    bool cmd_out_all(const char* line)
    {
        _output_format = report_output_format::all;
        return true;
    }
    bool cmd_out_fmt(const char* line)
    {
        if (strcmp(line, "txt") == 0)
        {
            _output_format = report_output_format::text;
        }
        else if (strcmp(line, "con") == 0)
        {
            _output_format = report_output_format::concise_text;
        }
        else if (strcmp(line, "csv") == 0)
        {
            _output_format = report_output_format::csv;
        }
        else
        {
            return false;
        }
        return true;
    }

    bool cmd_output(const char* line)
    {
        if (strcmp(line, "stdout") != 0)
        {
            _output_file = line;
        }
        else
        {
            _output_file = nullptr;
        }
        return true;
    }

    bool cmd_compare_results(const char* line)
    {
        if (*line) return false;
        _compare_results_across_samples = false;
        _compare_results_across_benchmarks = false;
        return true;
    }
};

class local_runner : public runner
{
public:
    local_runner() : runner(true)
    {}
};

// } // anonymous namespace

benchmark::benchmark(const char* name, benchmark_proc proc)
    : _name(name)
    , _proc(proc)
{}

benchmark& global_registry::new_benchmark(const char* name, benchmark_proc proc)
{
    return g_registry().add_benchmark(name, proc);
}

int global_registry::set_bench_suite(const char* name)
{
    g_registry().current_suite_name() = name;
    return 0;
}

#if (defined(_MSC_VER) || defined(__MINGW32__)) && !defined(PICOBENCH_TEST)

static const long long high_res_clock_freq = []() -> long long
{
    LARGE_INTEGER frequency;
    QueryPerformanceFrequency(&frequency);
    return frequency.QuadPart;
}();

high_res_clock::time_point high_res_clock::now()
{
    LARGE_INTEGER t;
    QueryPerformanceCounter(&t);
    return time_point(duration((t.QuadPart * rep(period::den)) / high_res_clock_freq));
}
#endif
}

#endif

#if defined(PICOBENCH_IMPLEMENT_MAIN)
int main(int argc, char* argv[])
{
    picobench::runner r;
    r.parse_cmd_line(argc, argv);
    return r.run();
}
#endif

#if defined(PICOBENCH_TEST)

// fake time keeping functions for the tests
namespace picobench
{

void this_thread_sleep_for_ns(uint64_t ns);

template <class Rep, class Period>
void this_thread_sleep_for(const std::chrono::duration<Rep, Period>& duration)
{
    this_thread_sleep_for_ns(std::chrono::duration_cast<std::chrono::nanoseconds>(duration).count());
}

#if defined(PICOBENCH_IMPLEMENT)
static struct fake_time
{
    uint64_t now;
} the_time;

void this_thread_sleep_for_ns(uint64_t ns)
{
    the_time.now += ns;
}

high_res_clock::time_point high_res_clock::now()
{
    auto ret = time_point(duration(the_time.now));
    return ret;
}
#endif

}

#endif