#ifndef du6bitcount_hpp_
#define du6bitcount_hpp_

#include <nmmintrin.h>

#include <array>
#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <vector>

/**
 * Fixed-size array of bits whose storage and bulk operations are delegated
 * to a policy-selected body type.
 *
 * @tparam P  Policy type. It must expose a nested type `body_t` that is
 *            constructible from a `std::size_t` and provides `size()`,
 *            `template data<T>()`, `is_zero()`, `assign_zero()`,
 *            `assign_not(a)`, `assign_and(a, b)`, `assign_or(a, b)` and
 *            `count_ones()`.
 */
template< typename P>
class bitarray
{
private:
	using self_ = bitarray< P>;

	// Individual bits are addressed through a byte view of the body's storage.
	// `unsigned char` is the one carrier that may legally alias storage of any
	// other type, so these accesses cannot be reordered by the optimizer
	// against the body's own word-wise operations.
	// NOTE(review): bit i lives in byte i / 8 at position i % 8. On a
	// little-endian host this is the same bit that the earlier 32-bit
	// addressing selected; big-endian hosts would differ - confirm if relevant.
	using carrier_t = unsigned char;
	static constexpr unsigned carrier_bits = 8 * sizeof(carrier_t);

	/// Mask selecting bit @p i inside the carrier that holds it.
	static carrier_t bit_mask(std::size_t i)
	{
		return static_cast<carrier_t>(1u << (i % carrier_bits));
	}
public:
	/// Constructs the underlying body with size argument @p s.
	bitarray( std::size_t s)
		: body_( s)
	{
	}

	/// @return the size reported by the body.
	std::size_t size() const
	{
		return body_.size();
	}

	/**
	 * Writes bit @p i.
	 * @param i  Bit index; must be less than size().
	 * @param v  New value of the bit.
	 * @throws std::range_error if @p i is out of range.
	 */
	void set(std::size_t i, bool v)
	{
		if (i >= body_.size())
			throw std::range_error("Index out of range in set");

		const carrier_t mask = bit_mask(i);
		carrier_t & w = body_.template data<carrier_t>()[i / carrier_bits];
		w = static_cast<carrier_t>(v ? (w | mask) : (w & ~mask));
	}

	/**
	 * Reads bit @p i.
	 * @param i  Bit index; must be less than size().
	 * @return the value of the bit.
	 * @throws std::range_error if @p i is out of range.
	 */
	bool get(std::size_t i) const
	{
		if (i >= body_.size())
			throw std::range_error("Index out of range in get");

		const carrier_t w = body_.template data<carrier_t>()[i / carrier_bits];
		return (w & bit_mask(i)) != 0;
	}

	/// Forwards to the body's is_zero().
	bool is_zero() const
	{
		return body_.is_zero();
	}

	/// Forwards to the body's assign_zero().
	void assign_zero()
	{
		body_.assign_zero();
	}

	/**
	 * Forwards to the body's assign_not() with the body of @p a.
	 * @throws std::range_error if @p a has a different size than *this.
	 */
	void assign_not(const self_ & a)
	{
		if (body_.size() != a.body_.size())
			throw std::range_error("Size mismatch in assign_not");

		body_.assign_not(a.body_);
	}

	/**
	 * Forwards to the body's assign_and() with the bodies of @p a and @p b.
	 * @throws std::range_error if either operand differs in size from *this.
	 */
	void assign_and(const self_ & a, const self_ & b)
	{
		if (body_.size() != a.body_.size() || body_.size() != b.body_.size())
			throw std::range_error("Size mismatch in assign_and");

		body_.assign_and(a.body_, b.body_);
	}

	/**
	 * Forwards to the body's assign_or() with the bodies of @p a and @p b.
	 * @throws std::range_error if either operand differs in size from *this.
	 */
	void assign_or(const self_ & a, const self_ & b)
	{
		if (body_.size() != a.body_.size() || body_.size() != b.body_.size())
			throw std::range_error("Size mismatch in assign_or");

		body_.assign_or(a.body_, b.body_);
	}

	/// Forwards to the body's count_ones().
	std::size_t count_ones() const
	{
		return body_.count_ones();
	}

private:
	using body_t = typename P::body_t;
	body_t body_;
};

/**
 * Bit storage backed by a std::vector of 64-bit words.
 *
 * The vector holds just enough words for the requested number of bits.
 * unused_bits_ records how many bits of the last word lie beyond the
 * requested size; assign_not() masks them so they stay zero.
 */
class body64
{
private:
	using self_ = body64;
	using simd_carrier_t = std::uint64_t;
	using simd_vector_t = std::vector< simd_carrier_t>;
	static constexpr unsigned simd_carrier_bits = 8 * sizeof(simd_carrier_t);
	static constexpr simd_carrier_t simd_zero_ = 0ULL;
	static constexpr simd_carrier_t simd_one_ = 1ULL;
	static constexpr simd_carrier_t simd_all_ones_ = simd_carrier_t(-1LL);

	/// Lookup table: entry i holds the number of one bits in the byte value i.
	struct byte_counter
	{
		byte_counter()
			: a_{}
		{
			// For every bit position (i is its weight), bump each byte value
			// that has this bit set: those are the runs [j, j + i) that start
			// at j = i and repeat every 2 * i.
			for (std::size_t i = 1; i & 0xFF; i <<= 1)
				for (std::size_t j = i; j < 256; j += 2 * i)
					for (std::size_t k = 0; k < i; ++k)
					{
						++a_[j + k];
					}
		}

		std::uint_least8_t operator[](std::size_t i) const
		{
			return a_[i];
		}
	private:
		std::array< std::uint_least8_t, 256> a_;
	};

	/// Number of words needed for @p s bits, computed without `s + 63`
	/// so that it cannot wrap around for very large @p s.
	static std::size_t words_for(std::size_t s)
	{
		return s / simd_carrier_bits + (s % simd_carrier_bits != 0 ? 1 : 0);
	}

	/// Number of bits in the last word that lie beyond bit @p s - 1.
	static unsigned padding_for(std::size_t s)
	{
		return static_cast<unsigned>(
			(simd_carrier_bits - s % simd_carrier_bits) % simd_carrier_bits);
	}

public:
	/// Creates storage for @p s bits, all initialised to zero.
	body64(std::size_t s)
		// The zero is passed as a temporary so the static constant is not
		// bound to a reference (no out-of-class definition required).
		: simd_v_( words_for(s), simd_carrier_t(simd_zero_)),
		unused_bits_( padding_for(s))
	{
	}

	/// @return the number of bits requested at construction.
	std::size_t size() const
	{
		return simd_v_.size() * simd_carrier_bits - unused_bits_;
	}

	/**
	 * Reinterprets the word storage as an array of @p T.
	 * NOTE(review): only character types are guaranteed to be valid for
	 * reading and writing 64-bit storage through the returned pointer.
	 */
	template< typename T>
	T * data()
	{
		return reinterpret_cast<T *>(simd_v_.data());
	}

	/// Const overload of data().
	template< typename T>
	const T * data() const
	{
		return reinterpret_cast<const T *>(simd_v_.data());
	}

	/// @return true when every stored word is zero (also for empty storage).
	bool is_zero() const
	{
		for (const simd_carrier_t w : simd_v_)
		{
			if (w != simd_zero_)
				return false;
		}
		return true;
	}

	/// Sets every stored word to zero.
	void assign_zero()
	{
		for (simd_carrier_t & w : simd_v_)
		{
			w = simd_zero_;
		}
	}

	/**
	 * Stores the bitwise complement of @p a into *this.
	 * @p a is indexed with this object's word count, so it must hold at
	 * least as many words as *this.
	 */
	void assign_not(const self_ & a)
	{
		// Words that are entirely in use; the last word is handled
		// separately when it carries padding bits.
		const std::size_t full = simd_v_.size() - (unused_bits_ != 0 ? 1 : 0);
		for (std::size_t i = 0; i < full; ++i)
		{
			simd_v_[i] = ~ a.simd_v_[i];
		}
		if (full < simd_v_.size())
		{
			// Keep the padding bits zero so they do not disturb
			// is_zero() and count_ones().
			simd_v_[full] = (~ a.simd_v_[full]) & (simd_all_ones_ >> unused_bits_);
		}
	}

	/**
	 * Stores the bitwise AND of @p a and @p b into *this.
	 * Both operands are indexed with this object's word count.
	 */
	void assign_and(const self_ & a, const self_ & b)
	{
		for (std::size_t i = 0; i < simd_v_.size(); ++i)
		{
			simd_v_[i] = a.simd_v_[i] & b.simd_v_[i];
		}
	}

	/**
	 * Stores the bitwise OR of @p a and @p b into *this.
	 * Both operands are indexed with this object's word count.
	 */
	void assign_or(const self_ & a, const self_ & b)
	{
		for (std::size_t i = 0; i < simd_v_.size(); ++i)
		{
			simd_v_[i] = a.simd_v_[i] | b.simd_v_[i];
		}
	}

	/// @return the number of one bits over all stored words.
	std::size_t count_ones() const
	{
		static const byte_counter table;
		std::size_t sum = 0;
		for (simd_carrier_t w : simd_v_)
		{
			// Consume the word one byte at a time; stops early once the
			// remaining high bytes are all zero.
			for (; w != simd_zero_; w >>= 8)
			{
				sum += table[ static_cast<std::size_t>(w & 0xFF)];
			}
		}
		return sum;
	}

private:
	simd_vector_t simd_v_;
	unsigned unused_bits_;
};

// Policy for bitarray<>: selects body64 as the storage/operation body.
struct policy_sse
{
	using body_t = body64;
};

// Both names are plain aliases of policy_sse, so they select the same
// body type as policy_sse does.
using policy_avx = policy_sse;
using policy_avx512 = policy_sse;

#endif

/**/
