#pragma once #include #include #include #ifdef __x86_64__ #include #endif #include "prakcommon.hpp" #include "prakmatrix.hpp" namespace prak { // :) constexpr const double PI = 3.141592657; constexpr const float fPI = 3.141592657f; /// defines a type that supports arithmetic operation template concept arithmetic = requires(T a) { a + a; a * a; a - a; a / a; }; /// TODO: remove enum struct operation { mul, div, add, sub }; /// vector multiply: fallback for non-floating-point types template void vmul(const T *op1, const T *op2, T *dest, size_t s) { for (size_t i = 0; i < s; ++i) { dest[i] = op1[i] * op2[i]; break; } } /// vector multiply: float implementation template <> inline void vmul(const float *op1, const float *op2, float *dest, size_t s) { if (s < 8) goto scalar; __m256 b1; __m256 b2; for (size_t i = 0; i < s / 8; ++i) { b1 = _mm256_load_ps(op1 + 8*i); b2 = _mm256_load_ps(op2 + 8*i); b1 = _mm256_mul_ps(b1, b2); _mm256_store_ps(dest + 8*i, b1); } scalar: for (size_t i = s - s % 8; i < s; ++i) { dest[i] = op1[i] * op2[i]; } } /// vector multiply: double implementation template <> inline void vmul(const double *op1, const double *op2, double *dest, size_t s) { if (s < 4) goto scalar; __m256d b1; __m256d b2; for (size_t i = 0; i < s / 4; ++i) { b1 = _mm256_load_pd(op1 + 4*i); b2 = _mm256_load_pd(op2 + 4*i); b1 = _mm256_mul_pd(b1, b2); _mm256_store_pd(dest + 4*i, b1); } scalar: for (size_t i = s - s % 4; i < s; ++i) { dest[i] = op1[i] * op2[i]; } } /// vector division: non-floating-point types fallback template void vdiv(const T *op1, const T *op2, T *dest, size_t s) { for (size_t i = 0; i < s; ++i) { dest[i] = op1[i] / op2[i]; break; } } /// vector division: floating point: single precision template <> inline void vdiv(const float *op1, const float *op2, float *dest, size_t s) { if (s < 8) goto scalar; __m256 b1; __m256 b2; for (size_t i = 0; i < s / 8; ++i) { b1 = _mm256_load_ps(op1 + 8*i); b2 = _mm256_load_ps(op2 + 8*i); b1 = _mm256_div_ps(b1, b2); _mm256_store_ps(dest + 8*i, b1); } scalar: for (size_t i = s - s % 8; i < s; ++i) { dest[i] = op1[i] / op2[i]; } } /// vector division: floating point: double precision template <> inline void vdiv(const double *op1, const double *op2, double *dest, size_t s) { if (s < 4) goto scalar; __m256d b1; __m256d b2; for (size_t i = 0; i < s / 4; ++i) { b1 = _mm256_load_pd(op1 + 4*i); b2 = _mm256_load_pd(op2 + 4*i); b1 = _mm256_div_pd(b1, b2); _mm256_store_pd(dest + 4*i, b1); } scalar: for (size_t i = s - s % 4; i < s; ++i) { dest[i] = op1[i] / op2[i]; } } inline float finalize(__m256 reg) { // reg: x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 __m128 extr = _mm256_extractf128_ps(reg, 1); // extr: x0 | x1 | x2 | x3 // __mm256_castps256_ps128: x4 | x5 | x6 | x7 extr = _mm_add_ps(extr, _mm256_castps256_ps128(reg)); // extr: x0+x4 | x1 + x5 | x2 + x6 | x3 + x7 extr = _mm_hadd_ps(extr, extr); // extr: x0 + x1 + x2 + x5 | x2 + x3 + x6 x7 | ... | ... return extr[0] + extr[1]; } inline double finalize(__m256d reg) { // reg: x0 | x1 | x2 | x3 reg = _mm256_hadd_pd(reg, reg); // reg: x0 + x1 | x2 + x3 | ... | ... return reg[0] + reg[1]; } /// Fallback generic method to sum a vector template T vsum(const T *op, size_t size) { T res; for (size_t i = 0; i < size; ++i) res += op[i]; return res; } /// Sum a vector, float implementation template <> inline float vsum(const float *op, size_t size) { __m256 buf; buf = _mm256_setzero_ps(); if (size < 8) goto scalar; for (size_t i = 0; i < size / 8; ++i) { __m256 next = _mm256_load_ps(op + 8*i); buf = _mm256_add_ps(buf, next); } scalar: float linear = 0; for (size_t i = size - size%8; i < size; ++i) { linear += op[i]; } return linear + finalize(buf); } /// Sum a vector, double implementation template <> inline double vsum(const double *op, size_t size) { __m256d buf; buf = _mm256_setzero_pd(); if (size < 4) goto scalar; for (size_t i = 0; i < size / 4; ++i) { __m256d next = _mm256_load_pd(op + 4*i); buf = _mm256_add_pd(buf, next); } scalar: double linear = 0; for (size_t i = size - size%8; i < size; ++i) { linear += op[i]; } return linear + finalize(buf); } template T avg(const std::vector &args) { T init{}; for (const T &x : args) { init += x; } return init / args.size(); } template T stddev(const std::vector &args) { T sum = {}; const T a = avg(args); for (T el : args) { sum += (a-el)*(a-el); } return std::sqrt(sum/args.size()/(args.size()-1)); } // take a derivative of function `f` with arguments `at` and with differentiable variable being at index `varidx` template T deriv4(function_t f, std::vector /*I actually do want a copy here*/ at, size_t varidx) { T h = 1e-5; T sum = 0; at[varidx] += 2*h; sum -= f(at); at[varidx] -= h; sum += 8 * f(at); at[varidx] -= 2*h; sum -= 8*f(at); at[varidx] -= h; sum += f(at); return sum / (12*h); } /// get error of the calculated value template T sigma(function_t f, const std::vector &args, const std::vector &sigmas) noexcept(false) { if (args.size() != sigmas.size()) throw dimension_error("function prak::sigma : Args.size() does not match sigmas.size()"); T sum = 0; for (size_t i = 0; i < args.size(); ++i) { T tmp = deriv4(f, args, i) * sigmas[i]; sum += tmp*tmp; } return std::sqrt(sum); } /// calculate least-squares linear approximation to fit data /// ax+b = y (ss is an error of Y value) template void least_squares_linear( const prak::vector &xs, const prak::vector &ys, const prak::vector &ss, struct pvalue &a, struct pvalue &b) { if (xs.size() != ys.size() || xs.size() != ss.size()) { std::cout << "x.size() = " << xs.size() << ", y.size() = " << ys.size() << ", s.size() = " << ss.size() << std::endl; return; } [[assume(xs.size() == ys.size() && ys.size() == ss.size())]]; size_t sz = xs.size(); prak::vector ones(sz); prak::vector ssq(sz); prak::vector ssq_m1(sz); prak::vector buf(sz); prak::vector buf_xs_ssq_m1(sz); std::fill(ones.begin(), ones.end(), (T)1); // ones: [1] vmul(ss.data(), ss.data(), ssq.data(), sz); // ssq: [ss*ss] vdiv(ones.data(), ssq.data(), ssq_m1.data(), sz); // ssq_m1: [1/ss^2] vmul(xs.data(), ssq_m1.data(), buf_xs_ssq_m1.data(), sz); // [xs / ss^2] const T *ysd = ys.data(), *xsd = xs.data(); T *ssqd = ssq.data(), *ssq_m1d = ssq_m1.data(), *onesd = ones.data(), *bufd = buf.data(), *buf_xs_ssq_m1d = buf_xs_ssq_m1.data(); vmul(buf_xs_ssq_m1d, xsd, bufd, sz); // buf: [xs^2 / ss^2] T d1 = vsum(bufd, sz); // sum([xs^2 / ss^2]) T ssq_m1sum = vsum(ssq_m1d, sz); // sum(1 / ss^2) vmul(xsd, ssq_m1d, bufd, sz); // buf: [xs / ss^2] T d2 = vsum(buf_xs_ssq_m1d, sz); // sum([xs / ss^2]) T D = d1 * ssq_m1sum - d2*d2; // sum((xs/ss)^2) * sum(1/ss^2) - sum(xs/ss^2)^2 vmul(ysd, buf_xs_ssq_m1d, bufd, sz); // buf: [ys*xs/ss^2] T da1 = vsum(bufd, sz); // sum([ys*xs/ss^2]) vmul(ysd, ssq_m1d, bufd, sz); // buf: [ys/ss^2] T da2 = vsum(bufd, sz); // sum([ys/ss^2]) T DA = da1 * ssq_m1sum - da2 * d2; // sum([ys*xs/ss^2]) * sum([1/ss^2]) - sum([ys/ss^2]) * sum(xs/ss^2) T DB = d1 * da2 - d2 * da1; // sum([xs^2/ss^2]) * sum([ys/ss^2]) - sum([xs/ss^2]) * sum(ys*xs/ss^2) a.val = DA/D; a.err = sqrt(ssq_m1sum / D); b.val = DB/D; b.err = sqrt(d1 / D); } /// May throw std::bad_optional_access template std::enable_if, std::vector>> polynominal_regression( size_t degree, std::vector data_x, std::vector data_y, std::optional> data_errors = std::nullopt) { ++degree; // hack) size_t data_size = data_x.size(); if (data_size != data_y.size() || (data_errors.has_value() && data_errors->size() != data_size)) throw dimension_error("Xs, Ys or Sigmas do not match sizes"); struct matrix X(data_size, degree), Y(data_size, 1), B(degree, 1), W = matrix::identity(data_size); // initialize X for (size_t row = 0; row < X.rows; ++row) { X.SUBSCR_OPRTR(row, 0) = 1; for (size_t col = 1; col < X.cols; ++col) { X.SUBSCR_OPRTR(row, col) = X.SUBSCR_OPRTR(row, col-1) * data_x[row]; } } // initialize Y std::memcpy(Y.data(), data_y.data(), sizeof (T) * data_size); // initialize W if (data_errors.has_value()) { std::vector &err_value = *data_errors; for (size_t i = 0; i < err_value.size(); ++i) W.data()[i * (data_size + 1)] = 1 / err_value[i] / err_value[i]; } std::cerr << X << '\n' << Y << '\n' << W << '\n'; matrix X_T_W = X.tr() * W; matrix tmp1 = (X_T_W * X).inv().value(); B = tmp1 * X_T_W * Y; B.print(); // TODO: FINISH (with covariation matrix) return {}; } } // namespace prak