-
Notifications
You must be signed in to change notification settings - Fork 2
/
predict.c
178 lines (134 loc) · 4.16 KB
/
predict.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
#include <stdio.h>
#include <stdint.h>
#include <x86intrin.h>
#include "libpopcnt.h"
/*
 * Return the CPU feature mask reported by get_cpuid(), remembering it
 * in a function-local static so later calls can reuse the stored value.
 */
int _get_cpuid() {
#if defined(__cplusplus)
  /* C++11 initialises function-local statics exactly once */
  static const int cached = get_cpuid();
  return cached;
#else
  /* -1 marks "not queried yet" */
  static int cache = -1;
  int snapshot = cache;

  if (snapshot != -1)
    return snapshot;

  snapshot = get_cpuid();
  /* Publish the value only if the cache still holds the -1 marker */
  __sync_val_compare_and_swap(&cache, -1, snapshot);
  return snapshot;
#endif
}
/*
 * Count the 1 bits in a raw buffer. The CPU feature mask is passed in
 * by the caller instead of being queried inside this function.
 * @data:  Start of the buffer
 * @size:  Length of the buffer in bytes
 * @cpuid: Feature mask, tested against bit_AVX2 and bit_POPCNT
 *
 * Returns the number of set bits found in the buffer.
 */
static inline uint64_t popcnt_no_cpuid(const void* data, uint64_t size, int cpuid) {
  const uint8_t* cursor = (const uint8_t*) data;
  uint64_t total = 0;
  uint64_t k;

#if defined(HAVE_AVX2)
  /* The AVX2 kernel is only used for buffers of at least 512 bytes */
  if (size >= 512 && (cpuid & bit_AVX2))
  {
    uint64_t leftover;

    /* NOTE(review): presumably advances cursor to a 32-byte boundary and
       counts the skipped bytes into total - confirm in libpopcnt.h */
    align_avx2(&cursor, &size, &total);
    total += popcnt_avx2((const __m256i*) cursor, size / 32);

    /* Step past the whole 32-byte chunks handed to the kernel */
    leftover = size % 32;
    cursor += size - leftover;
    size = leftover;
  }
#endif

#if defined(HAVE_POPCNT)
  if (cpuid & bit_POPCNT)
  {
    uint64_t leftover = size % 8;

    /* Whole 8-byte words first ... */
    total += popcnt64_unrolled((const uint64_t*) cursor, size / 8);
    cursor += size - leftover;
    size = leftover;

    /* ... then the trailing bytes one at a time */
    k = 0;
    while (k < size)
    {
      total += popcnt64(cursor[k]);
      k++;
    }
    return total;
  }
#endif

  /* Fallback: pure integer popcount, applied byte by byte */
  for (k = 0; k < size; k++)
  {
    total += popcount64(cursor[k]);
  }

  return total;
}
/*
 * Score every item against one user with a float dot product:
 *
 *   out[i] = user_bias + item_biases[i]
 *            + sum_j user_vector[j] * item_vectors[i * latent_dim + j]
 *
 * @user_vector:  latent_dim floats
 * @item_vectors: num_items rows of latent_dim floats, stored back to back
 * @user_bias:    scalar added to every prediction
 * @item_biases:  num_items floats, one per item
 * @out:          receives num_items predictions
 * @num_items:    number of item rows
 * @latent_dim:   number of floats per vector
 *
 * Eight lanes at a time go through AVX/FMA; any tail elements are handled
 * by a scalar loop. The buffers need no particular alignment: row i starts
 * at item_vectors + i * latent_dim, which is not 32-byte aligned unless
 * latent_dim is a multiple of 8, so unaligned loads/stores are used
 * throughout.
 */
void predict_float_256(float* user_vector,
                       float* item_vectors,
                       float user_bias,
                       float* item_biases,
                       float* out,
                       intptr_t num_items,
                       intptr_t latent_dim) {
  intptr_t i;

  for (i = 0; i < num_items; i++) {
    const float* item_vector = item_vectors + (i * latent_dim);
    __m256 prediction = _mm256_setzero_ps();
    float scalar_prediction = item_biases[i] + user_bias;
    float unpacked[8];
    intptr_t j;
    int k;

    /* Vectorised part: eight multiply-accumulates per iteration */
    for (j = 0; j + 8 <= latent_dim; j += 8) {
      __m256 x = _mm256_loadu_ps(item_vector + j);
      __m256 y = _mm256_loadu_ps(user_vector + j);
      prediction = _mm256_fmadd_ps(x, y, prediction);
    }

    /* Horizontal sum of the eight partial accumulators; the stack
       array is not guaranteed to be 32-byte aligned either */
    _mm256_storeu_ps(unpacked, prediction);
    for (k = 0; k < 8; k++) {
      scalar_prediction += unpacked[k];
    }

    /* Remainder: the last latent_dim % 8 elements */
    for (; j < latent_dim; j++) {
      scalar_prediction += item_vector[j] * user_vector[j];
    }

    out[i] = scalar_prediction;
  }
}
/*
 * Score every item against one user using bit-packed vectors.
 *
 * Each vector is latent_dim 32-bit words. For item i the number of bit
 * positions where user and item agree (XNOR, then popcount) is on_bits,
 * and with max_on_bits = latent_dim * 32 the prediction is
 *
 *   out[i] = (on_bits - (max_on_bits - on_bits)) * user_norm * item_norms[i]
 *            + user_bias + item_biases[i]
 *
 * i.e. (agreeing bits - disagreeing bits), scaled by both norms, plus the
 * biases.
 *
 * @user_vector:  latent_dim packed words
 * @item_vectors: num_items rows of latent_dim packed words, back to back
 * @user_bias:    scalar added to every prediction
 * @item_biases:  num_items floats, one per item
 * @user_norm:    scale factor for the user
 * @item_norms:   num_items scale factors, one per item
 * @out:          receives num_items predictions
 * @num_items:    number of item rows
 * @latent_dim:   number of 32-bit words per vector
 *
 * The buffers need no particular alignment: row i starts at
 * item_vectors + i * latent_dim, which is not 32-byte aligned unless
 * latent_dim is a multiple of 8, so unaligned loads/stores are used.
 */
void predict_xnor_256(int32_t* user_vector,
                      int32_t* item_vectors,
                      float user_bias,
                      float* item_biases,
                      float user_norm,
                      float* item_norms,
                      float* out,
                      intptr_t num_items,
                      intptr_t latent_dim) {
  const int cpuid = _get_cpuid();
  const float max_on_bits = latent_dim * 32;
  /* Comparing zero with zero sets every bit: the mask used to invert XOR */
  const __m256i allbits = _mm256_cmpeq_epi32(
      _mm256_setzero_si256(),
      _mm256_setzero_si256());
  intptr_t i;

  for (i = 0; i < num_items; i++) {
    const int32_t* item_vector = item_vectors + (i * latent_dim);
    unsigned int on_bits = 0;
    int32_t bits[8];
    float scalar_prediction;
    intptr_t j;

    /* Vectorised part: eight words (256 bits) per iteration */
    for (j = 0; j + 8 <= latent_dim; j += 8) {
      /* Load */
      __m256i x = _mm256_loadu_si256((const __m256i*) (item_vector + j));
      __m256i y = _mm256_loadu_si256((const __m256i*) (user_vector + j));
      /* XNOR = NOT(XOR) */
      __m256i xnor = _mm256_xor_si256(_mm256_xor_si256(x, y), allbits);
      /* Spill to the stack scratch buffer (no alignment guarantee) */
      _mm256_storeu_si256((__m256i*) bits, xnor);
      /* Bitcount */
      on_bits += (unsigned int) popcnt_no_cpuid((const void*) bits,
                                                sizeof bits, cpuid);
    }

    /* Remainder: the last latent_dim % 8 words, one at a time */
    for (; j < latent_dim; j++) {
      on_bits += (unsigned int) __builtin_popcount(
          (unsigned int) ~(user_vector[j] ^ item_vector[j]));
    }

    /* Scaling: (agreeing - disagreeing) bits times both norms */
    scalar_prediction = (on_bits - (max_on_bits - on_bits))
        * user_norm * item_norms[i];

    /* Biases */
    out[i] = scalar_prediction + user_bias + item_biases[i];
  }
}