From 25750d59cc2a7156cd0b9e63ef04ef75f2b92a86 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Fri, 6 Oct 2017 00:26:45 +0300 Subject: [PATCH] Added kmeansiou.c --- scripts/kmeansiou.c | 392 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 392 insertions(+) create mode 100644 scripts/kmeansiou.c diff --git a/scripts/kmeansiou.c b/scripts/kmeansiou.c new file mode 100644 index 00000000..02f9e9a5 --- /dev/null +++ b/scripts/kmeansiou.c @@ -0,0 +1,392 @@ +//usr/bin/cc -Ofast -lm "${0}" -o "${0%.c}" && ./"${0%.c}" "$@"; s=$?; rm ./"${0%.c}"; exit $s + +#include +#include +#include +#include +#include + +typedef struct matrix{ + int rows, cols; + double **vals; +} matrix; + +matrix csv_to_matrix(char *filename, int header); +matrix make_matrix(int rows, int cols); +void zero_matrix(matrix m); + +void copy(double *x, double *y, int n); +double dist(double *x, double *y, int n); +int *sample(int n); + +int find_int_arg(int argc, char **argv, char *arg, int def); +int find_arg(int argc, char* argv[], char *arg); + +int closest_center(double *datum, matrix centers) +{ + int j; + int best = 0; + double best_dist = dist(datum, centers.vals[best], centers.cols); + for(j = 0; j < centers.rows; ++j){ + double new_dist = dist(datum, centers.vals[j], centers.cols); + if(new_dist < best_dist){ + best_dist = new_dist; + best = j; + } + } + return best; +} + +double dist_to_closest_center(double *datum, matrix centers) +{ + int ci = closest_center(datum, centers); + return dist(datum, centers.vals[ci], centers.cols); +} + +int kmeans_expectation(matrix data, int *assignments, matrix centers) +{ + int i; + int converged = 1; + for(i = 0; i < data.rows; ++i){ + int closest = closest_center(data.vals[i], centers); + if(closest != assignments[i]) converged = 0; + assignments[i] = closest; + } + return converged; +} + +void kmeans_maximization(matrix data, int *assignments, matrix centers) +{ + int i,j; + int *counts = calloc(centers.rows, sizeof(int)); + zero_matrix(centers); + for(i = 0; i < data.rows; ++i){ + ++counts[assignments[i]]; + for(j = 0; j < data.cols; ++j){ + centers.vals[assignments[i]][j] += data.vals[i][j]; + } + } + for(i = 0; i < centers.rows; ++i){ + if(counts[i]){ + for(j = 0; j < centers.cols; ++j){ + centers.vals[i][j] /= counts[i]; + } + } + } +} + +double WCSS(matrix data, int *assignments, matrix centers) +{ + int i, j; + double sum = 0; + + for(i = 0; i < data.rows; ++i){ + int ci = assignments[i]; + sum += (1 - dist(data.vals[i], centers.vals[ci], data.cols)); + } + return sum / data.rows; +} + +typedef struct{ + int *assignments; + matrix centers; +} model; + +void smart_centers(matrix data, matrix centers) { + int i,j; + copy(data.vals[rand()%data.rows], centers.vals[0], data.cols); + double *weights = calloc(data.rows, sizeof(double)); + int clusters = centers.rows; + for (i = 1; i < clusters; ++i) { + double sum = 0; + centers.rows = i; + for (j = 0; j < data.rows; ++j) { + weights[j] = dist_to_closest_center(data.vals[j], centers); + sum += weights[j]; + } + double r = sum*((double)rand()/RAND_MAX); + for (j = 0; j < data.rows; ++j) { + r -= weights[j]; + if(r <= 0){ + copy(data.vals[j], centers.vals[i], data.cols); + break; + } + } + } + free(weights); +} + +void random_centers(matrix data, matrix centers){ + int i; + int *s = sample(data.rows); + for(i = 0; i < centers.rows; ++i){ + copy(data.vals[s[i]], centers.vals[i], data.cols); + } + free(s); +} + +model do_kmeans(matrix data, int k) +{ + matrix centers = make_matrix(k, data.cols); + int *assignments = calloc(data.rows, sizeof(int)); + smart_centers(data, centers); + //random_centers(data, centers); + if(k == 1) kmeans_maximization(data, assignments, centers); + while(!kmeans_expectation(data, assignments, centers)){ + kmeans_maximization(data, assignments, centers); + } + model m; + m.assignments = assignments; + m.centers = centers; + return m; +} + +int main(int argc, char *argv[]) +{ + if(argc < 3){ + fprintf(stderr, "usage: %s [points/centers/stats]\n", argv[0]); + return 0; + } + int i,j; + srand(time(0)); + matrix data = csv_to_matrix(argv[1], 0); + int k = find_int_arg(argc, argv, "-k", 2); + int header = find_arg(argc, argv, "-h"); + int count = find_arg(argc, argv, "-c"); + + if(strcmp(argv[2], "assignments")==0){ + model m = do_kmeans(data, k); + int *assignments = m.assignments; + for(i = 0; i < k; ++i){ + if(i != 0) printf("-\n"); + for(j = 0; j < data.rows; ++j){ + if(!(assignments[j] == i)) continue; + printf("%f, %f\n", data.vals[j][0], data.vals[j][1]); + } + } + }else if(strcmp(argv[2], "centers")==0){ + model m = do_kmeans(data, k); + printf("WCSS: %f\n", WCSS(data, m.assignments, m.centers)); + int *counts = 0; + if(count){ + counts = calloc(k, sizeof(int)); + for(j = 0; j < data.rows; ++j){ + ++counts[m.assignments[j]]; + } + } + for(j = 0; j < m.centers.rows; ++j){ + if(count) printf("%d, ", counts[j]); + printf("%f, %f\n", m.centers.vals[j][0], m.centers.vals[j][1]); + } + }else if(strcmp(argv[2], "scan")==0){ + for(i = 1; i <= k; ++i){ + model m = do_kmeans(data, i); + printf("%f\n", WCSS(data, m.assignments, m.centers)); + } + } + return 0; +} + +// Utility functions + +int *sample(int n) +{ + int i; + int *s = calloc(n, sizeof(int)); + for(i = 0; i < n; ++i) s[i] = i; + for(i = n-1; i >= 0; --i){ + int swap = s[i]; + int index = rand()%(i+1); + s[i] = s[index]; + s[index] = swap; + } + return s; +} + +double dist(double *x, double *y, int n) +{ + int i; + double mw = (x[0] < y[0]) ? x[0] : y[0]; + double mh = (x[1] < y[1]) ? x[1] : y[1]; + double inter = mw*mh; + double sum = x[0]*x[1] + y[0]*y[1]; + double un = sum - inter; + double iou = inter/un; + return 1-iou; +} + +void copy(double *x, double *y, int n) +{ + int i; + for(i = 0; i < n; ++i) y[i] = x[i]; +} + +void error(char *s){ + fprintf(stderr, "Error: %s\n", s); + exit(0); +} + +char *fgetl(FILE *fp) +{ + if(feof(fp)) return 0; + int size = 512; + char *line = malloc(size*sizeof(char)); + if(!fgets(line, size, fp)){ + free(line); + return 0; + } + + int curr = strlen(line); + + while(line[curr-1]!='\n'){ + size *= 2; + line = realloc(line, size*sizeof(char)); + if(!line) error("Malloc"); + fgets(&line[curr], size-curr, fp); + curr = strlen(line); + } + line[curr-1] = '\0'; + + return line; +} + +// Matrix stuff + +int count_fields(char *line) +{ + int count = 0; + int done = 0; + char *c; + for(c = line; !done; ++c){ + done = (*c == '\0'); + if(*c == ',' || done) ++count; + } + return count; +} + +double *parse_fields(char *l, int n) +{ + int i; + double *field = calloc(n, sizeof(double)); + for(i = 0; i < n; ++i){ + field[i] = atof(l); + l = strchr(l, ',')+1; + } + return field; +} + +matrix make_matrix(int rows, int cols) +{ + matrix m; + m.rows = rows; + m.cols = cols; + m.vals = calloc(m.rows, sizeof(double *)); + int i; + for(i = 0; i < m.rows; ++i) m.vals[i] = calloc(m.cols, sizeof(double)); + return m; +} + +void zero_matrix(matrix m) +{ + int i, j; + for(i = 0; i < m.rows; ++i){ + for(j = 0; j < m.cols; ++j) m.vals[i][j] = 0; + } +} + +matrix csv_to_matrix(char *filename, int header) +{ + FILE *fp = fopen(filename, "r"); + if(!fp) error(filename); + + matrix m; + m.cols = -1; + + char *line; + + int n = 0; + int size = 1024; + m.vals = calloc(size, sizeof(double*)); + if(header) fgetl(fp); + while((line = fgetl(fp))){ + if(m.cols == -1) m.cols = count_fields(line); + if(n == size){ + size *= 2; + m.vals = realloc(m.vals, size*sizeof(double*)); + } + m.vals[n] = parse_fields(line, m.cols); + free(line); + ++n; + } + m.vals = realloc(m.vals, n*sizeof(double*)); + m.rows = n; + return m; +} + +// Arguement parsing + +void del_arg(int argc, char **argv, int index) +{ + int i; + for(i = index; i < argc-1; ++i) argv[i] = argv[i+1]; + argv[i] = 0; +} + +int find_arg(int argc, char* argv[], char *arg) +{ + int i; + for(i = 0; i < argc; ++i) { + if(!argv[i]) continue; + if(0==strcmp(argv[i], arg)) { + del_arg(argc, argv, i); + return 1; + } + } + return 0; +} + +int find_int_arg(int argc, char **argv, char *arg, int def) +{ + int i; + for(i = 0; i < argc-1; ++i){ + if(!argv[i]) continue; + if(0==strcmp(argv[i], arg)){ + def = atoi(argv[i+1]); + del_arg(argc, argv, i); + del_arg(argc, argv, i); + break; + } + } + return def; +} + +float find_float_arg(int argc, char **argv, char *arg, float def) +{ + int i; + for(i = 0; i < argc-1; ++i){ + if(!argv[i]) continue; + if(0==strcmp(argv[i], arg)){ + def = atof(argv[i+1]); + del_arg(argc, argv, i); + del_arg(argc, argv, i); + break; + } + } + return def; +} + +char *find_char_arg(int argc, char **argv, char *arg, char *def) +{ + int i; + for(i = 0; i < argc-1; ++i){ + if(!argv[i]) continue; + if(0==strcmp(argv[i], arg)){ + def = argv[i+1]; + del_arg(argc, argv, i); + del_arg(argc, argv, i); + break; + } + } + return def; +} +