################### 1. Implement Matrix Multiplication algorithm through i) Sequential Algorithm, ii) Parallel Algorithm using OpenMP #############

#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

#define IDX(i, j, n) ((i) * (n) + (j))

void fill_random(double *A, int n) {
    for (int i = 0; i < n * n; i++) {
        A[i] = (double)(rand() % 10);
    }
}

void matmul_serial(const double *A, const double *B, double *C, int n) {
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            double sum = 0.0;
            for (int k = 0; k < n; k++) {
                sum += A[IDX(i, k, n)] * B[IDX(k, j, n)];
            }
            C[IDX(i, j, n)] = sum;
        }
    }
}

void matmul_parallel(const double *A, const double *B, double *C, int n) {
    #pragma omp parallel for
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            double sum = 0.0;
            for (int k = 0; k < n; k++) {
                sum += A[IDX(i, k, n)] * B[IDX(k, j, n)];
            }
            C[IDX(i, j, n)] = sum;
        }
    }
}

int main() {
    srand(42);

    int n = 4;
    double *A = (double *)malloc(n * n * sizeof(double));
    double *B = (double *)malloc(n * n * sizeof(double));
    double *C = (double *)malloc(n * n * sizeof(double));

    fill_random(A, n);
    fill_random(B, n);
    matmul_parallel(A, B, C, n);

    free(A);
    free(B);
    free(C);

    int sizes[] = {100, 200, 300, 400};
    int count = sizeof(sizes) / sizeof(sizes[0]);

    for (int s = 0; s < count; s++) {
        n = sizes[s];

        A = (double *)malloc(n * n * sizeof(double));
        B = (double *)malloc(n * n * sizeof(double));
        double *C1 = (double *)malloc(n * n * sizeof(double));
        double *C2 = (double *)malloc(n * n * sizeof(double));

        fill_random(A, n);
        fill_random(B, n);

        double t1 = omp_get_wtime();
        matmul_serial(A, B, C1, n);
        double t2 = omp_get_wtime();

        double t3 = omp_get_wtime();
        matmul_parallel(A, B, C2, n);
        double t4 = omp_get_wtime();

        printf("Size %d x %d: Sequential = %.6f s, Parallel = %.6f s\n",
               n, n, t2 - t1, t4 - t3);

        free(A);
        free(B);
        free(C1);
        free(C2);
    }

    return 0;
}


gcc -fopenmp matmul_openmp.c -o matmul_openmp
./matmul_openmp


#################### 2. Implement Circuit Satisfiability Problem and comment on NP-Hard by increasing number of input to Boolean circuit######################

#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

int circuit(unsigned long long value, int n) {
    int result = 0;

    for (int i = 0; i < n - 1; i += 2) {
        int xi = (value >> i) & 1;
        int xj = (value >> (i + 1)) & 1;
        result = result || (xi && !xj);
    }

    return result;
}

int main() {
    int n;
    printf("Enter number of Boolean variables: ");
    scanf("%d", &n);

    unsigned long long total = 1ULL << n;
    int found = 0;

    double start = omp_get_wtime();

    #pragma omp parallel for reduction(||:found)
    for (unsigned long long i = 0; i < total; i++) {
        if (circuit(i, n)) {
            found = 1;
        }
    }

    double end = omp_get_wtime();

    printf("Circuit SAT Result: %s\n", found ? "SATISFIABLE" : "UNSATISFIABLE");
    printf("Execution Time: %.6f seconds\n", end - start);

    return 0;
}



gcc -fopenmp circuit_sat_openmp.c -o circuit_sat

export OMP_NUM_THREADS=8
./circuit_sat


################## 3. Quick Sort Algorithm using OpenMP ######################

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <omp.h>

#define CUTOFF 100

int partition(int arr[], int low, int high) {
    int pivot = arr[high];
    int i = low - 1;
    for (int j = low; j < high; j++) {
        if (arr[j] <= pivot) {
            i++;
            int t = arr[i];
            arr[i] = arr[j];
            arr[j] = t;
        }
    }
    int t = arr[i + 1];
    arr[i + 1] = arr[high];
    arr[high] = t;
    return i + 1;
}

void quicksort_serial(int arr[], int low, int high) {
    if (low < high) {
        int pi = partition(arr, low, high);
        quicksort_serial(arr, low, pi - 1);
        quicksort_serial(arr, pi + 1, high);
    }
}

void quicksort_parallel(int arr[], int low, int high) {
    if (low < high) {
        if (high - low < CUTOFF) {
            quicksort_serial(arr, low, high);
            return;
        }

        int pi = partition(arr, low, high);

        #pragma omp task shared(arr)
        quicksort_parallel(arr, low, pi - 1);

        #pragma omp task shared(arr)
        quicksort_parallel(arr, pi + 1, high);

        #pragma omp taskwait
    }
}

void fill_random(int arr[], int n) {
    for (int i = 0; i < n; i++) arr[i] = rand() % 10000;
}

void print_array(int arr[], int n) {
    for (int i = 0; i < n; i++) {
        printf("%d ", arr[i]);
    }
    printf("\n");
}

int main() {
    srand(42);

    int small[10];
    fill_random(small, 10);

    int small_parallel[10];
    memcpy(small_parallel, small, sizeof(small));

    #pragma omp parallel
    {
        #pragma omp single
        quicksort_parallel(small_parallel, 0, 9);
    }

    print_array(small_parallel, 10);

    int sizes[] = {1000, 2000, 3000, 4000};

    for (int s = 0; s < 4; s++) {
        int n = sizes[s];

        int *a = (int *)malloc(n * sizeof(int));
        int *b = (int *)malloc(n * sizeof(int));

        fill_random(a, n);
        memcpy(b, a, n * sizeof(int));

        double start = omp_get_wtime();
        quicksort_serial(a, 0, n - 1);
        double serial_time = omp_get_wtime() - start;

        start = omp_get_wtime();
        #pragma omp parallel
        {
            #pragma omp single
            quicksort_parallel(b, 0, n - 1);
        }
        double parallel_time = omp_get_wtime() - start;

        printf("Size %d: Sequential time = %f s, Parallel time = %f s\n",
               n, serial_time, parallel_time);

        free(a);
        free(b);
    }

    return 0;
}


nano quicksort_omp.c
gcc -O2 -fopenmp quicksort_omp.c -o quicksort_omp
./quicksort_omp


################## Implement Linear Systems solution using Conjugate Gradient Method through Parallell OpenMP programming and then analyze the algorithm ########################33

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <time.h>
#include <omp.h>

static double wall_time(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec + ts.tv_nsec * 1e-9;
}

static double dot_serial(const double *a, const double *b, int n) {
    double s = 0.0;
    for (int i = 0; i < n; i++)
        s += a[i] * b[i];
    return s;
}

static double dot_parallel(const double *a, const double *b, int n) {
    double s = 0.0;
    #pragma omp parallel for reduction(+:s) schedule(static)
    for (int i = 0; i < n; i++)
        s += a[i] * b[i];
    return s;
}

static void matvec_serial(const double *A, const double *x, double *y, int n) {
    for (int i = 0; i < n; i++) {
        double s = 0.0;
        for (int j = 0; j < n; j++)
            s += A[i * n + j] * x[j];
        y[i] = s;
    }
}

static void matvec_parallel(const double *A, const double *x, double *y, int n) {
    #pragma omp parallel for schedule(static)
    for (int i = 0; i < n; i++) {
        double s = 0.0;
        for (int j = 0; j < n; j++)
            s += A[i * n + j] * x[j];
        y[i] = s;
    }
}

int cg_serial(const double *A, const double *b, double *x,
              int n, double tol, int maxiter, int *iters) {
    double *r = malloc(n * sizeof(double));
    double *p = malloc(n * sizeof(double));
    double *Ap = malloc(n * sizeof(double));

    memset(x, 0, n * sizeof(double));
    memcpy(r, b, n * sizeof(double));
    memcpy(p, b, n * sizeof(double));

    double rr = dot_serial(r, r, n);
    int k = 0;

    for (k = 0; k < maxiter; k++) {
        if (sqrt(rr) < tol) break;

        matvec_serial(A, p, Ap, n);
        double pAp = dot_serial(p, Ap, n);
        double alpha = rr / pAp;

        for (int i = 0; i < n; i++)
            x[i] += alpha * p[i];

        for (int i = 0; i < n; i++)
            r[i] -= alpha * Ap[i];

        double rr_new = dot_serial(r, r, n);
        double beta = rr_new / rr;
        rr = rr_new;

        for (int i = 0; i < n; i++)
            p[i] = r[i] + beta * p[i];
    }

    *iters = k;

    free(r);
    free(p);
    free(Ap);

    return (sqrt(rr) < tol) ? 0 : 1;
}

int cg_parallel(const double *A, const double *b, double *x,
                int n, double tol, int maxiter, int *iters) {
    double *r = malloc(n * sizeof(double));
    double *p = malloc(n * sizeof(double));
    double *Ap = malloc(n * sizeof(double));

    memset(x, 0, n * sizeof(double));
    memcpy(r, b, n * sizeof(double));
    memcpy(p, b, n * sizeof(double));

    double rr = dot_parallel(r, r, n);
    int k = 0;

    for (k = 0; k < maxiter; k++) {
        if (sqrt(rr) < tol) break;

        matvec_parallel(A, p, Ap, n);
        double pAp = dot_parallel(p, Ap, n);
        double alpha = rr / pAp;

        #pragma omp parallel for schedule(static)
        for (int i = 0; i < n; i++) {
            x[i] += alpha * p[i];
            r[i] -= alpha * Ap[i];
        }

        double rr_new = dot_parallel(r, r, n);
        double beta = rr_new / rr;
        rr = rr_new;

        #pragma omp parallel for schedule(static)
        for (int i = 0; i < n; i++)
            p[i] = r[i] + beta * p[i];
    }

    *iters = k;

    free(r);
    free(p);
    free(Ap);

    return (sqrt(rr) < tol) ? 0 : 1;
}

void make_spd(double *A, double *b, int n, unsigned seed) {
    srand(seed);
    double *L = calloc(n * n, sizeof(double));

    for (int i = 0; i < n; i++)
        for (int j = 0; j <= i; j++)
            L[i * n + j] = (double)rand() / RAND_MAX;

    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++) {
            double s = 0.0;
            for (int k = 0; k <= (i < j ? i : j); k++)
                s += L[i * n + k] * L[j * n + k];
            A[i * n + j] = s + (i == j ? n : 0);
        }

    for (int i = 0; i < n; i++)
        b[i] = (double)rand() / RAND_MAX * 10.0;

    free(L);
}

int main(void) {
    printf("OpenMP threads available: %d\n\n", omp_get_max_threads());

    int sizes[] = {256, 512, 1024, 2048};
    int nsizes = sizeof(sizes) / sizeof(sizes[0]);

    printf("%-6s %-14s %-14s %-8s\n", "N", "Serial(ms)", "Parallel(ms)", "Iters");

    for (int si = 0; si < nsizes; si++) {
        int n = sizes[si];

        double *A = malloc((size_t)n * n * sizeof(double));
        double *b = malloc(n * sizeof(double));
        double *xs = malloc(n * sizeof(double));
        double *xp = malloc(n * sizeof(double));

        make_spd(A, b, n, 42 + si);

        int its, itp;
        double t0;

        t0 = wall_time();
        cg_serial(A, b, xs, n, 1e-8, 5000, &its);
        double ts = (wall_time() - t0) * 1e3;

        t0 = wall_time();
        cg_parallel(A, b, xp, n, 1e-8, 5000, &itp);
        double tp = (wall_time() - t0) * 1e3;

        printf("%-6d %-14.3f %-14.3f %-8d\n", n, ts, tp, its);

        free(A);
        free(b);
        free(xs);
        free(xp);
    }

    return 0;
}


gcc -O2 -fopenmp -o cg_openmp cg_openmp.c -lm

OMP_NUM_THREADS=8
./cg_openmp

################# 5. Implement Matrix Multiplication algorithm through i) Sequential Algorithm, ii) Parallel Algorithm using MPI ###################################


#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

static void fill_random(double *m, int n) {
    for (int i = 0; i < n * n; i++) m[i] = rand() % 10;
}

static void print_matrix(const char *name, const double *m, int n) {
    printf("%s\n", name);
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) printf("%.0f ", m[i * n + j]);
        printf("\n");
    }
}

static void sequential_multiply(const double *a, const double *b, double *c, int n) {
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            double sum = 0.0;
            for (int k = 0; k < n; k++) sum += a[i * n + k] * b[k * n + j];
            c[i * n + j] = sum;
        }
    }
}

static void parallel_multiply(const double *a_root, double *b, double *c_root, int n, MPI_Comm comm) {
    int rank, size;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    int base = n / size;
    int rem = n % size;

    int local_rows = base + (rank < rem ? 1 : 0);
    int local_count = local_rows * n;

    int *sendcounts = NULL;
    int *displs = NULL;

    if (rank == 0) {
        sendcounts = (int *)malloc(size * sizeof(int));
        displs = (int *)malloc(size * sizeof(int));

        int row_off = 0;
        for (int i = 0; i < size; i++) {
            int r = base + (i < rem ? 1 : 0);
            sendcounts[i] = r * n;
            displs[i] = row_off * n;
            row_off += r;
        }
    }

    double *local_a = (double *)malloc((local_count > 0 ? local_count : 1) * sizeof(double));
    double *local_c = (double *)malloc((local_count > 0 ? local_count : 1) * sizeof(double));

    MPI_Bcast(b, n * n, MPI_DOUBLE, 0, comm);
    MPI_Scatterv((void *)a_root, sendcounts, displs, MPI_DOUBLE,
                 local_a, local_count, MPI_DOUBLE, 0, comm);

    for (int i = 0; i < local_rows; i++) {
        for (int j = 0; j < n; j++) {
            double sum = 0.0;
            for (int k = 0; k < n; k++) sum += local_a[i * n + k] * b[k * n + j];
            local_c[i * n + j] = sum;
        }
    }

    MPI_Gatherv(local_c, local_count, MPI_DOUBLE,
                c_root, sendcounts, displs, MPI_DOUBLE, 0, comm);

    free(local_a);
    free(local_c);
    if (rank == 0) {
        free(sendcounts);
        free(displs);
    }
}

static void run_case(int n, int demo, MPI_Comm comm, int rank) {
    double *a = NULL;
    double *b = (double *)malloc(n * n * sizeof(double));
    double *c_par = NULL;
    double *c_seq = NULL;

    if (rank == 0) {
        a = (double *)malloc(n * n * sizeof(double));
        c_par = (double *)malloc(n * n * sizeof(double));
        if (!demo) c_seq = (double *)malloc(n * n * sizeof(double));

        fill_random(a, n);
        fill_random(b, n);
    }

    if (!demo && rank == 0) {
        double t0 = MPI_Wtime();
        sequential_multiply(a, b, c_seq, n);
        double seq_time = MPI_Wtime() - t0;

        MPI_Barrier(comm);
        double p0 = MPI_Wtime();
        parallel_multiply(a, b, c_par, n, comm);
        double par_local = MPI_Wtime() - p0;

        double par_time = 0.0;
        MPI_Reduce(&par_local, &par_time, 1, MPI_DOUBLE, MPI_MAX, 0, comm);

        printf("%d x %d -> sequential: %.6f s, parallel: %.6f s\n",
               n, n, seq_time, par_time);
    } else if (demo) {
        MPI_Barrier(comm);
        parallel_multiply(a, b, c_par, n, comm);

        if (rank == 0) {
            print_matrix("A", a, n);
            print_matrix("B", b, n);
            print_matrix("C", c_par, n);
        }
    } else {
        MPI_Barrier(comm);
        parallel_multiply(a, b, c_par, n, comm);
    }

    free(b);
    if (rank == 0) {
        free(a);
        free(c_par);
        free(c_seq);
    }
}

int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank == 0) srand(42);

    run_case(4, 1, MPI_COMM_WORLD, rank);

    int sizes[] = {100, 200, 300, 400};
    for (int i = 0; i < 4; i++) {
        run_case(sizes[i], 0, MPI_COMM_WORLD, rank);
    }

    MPI_Finalize();
    return 0;
}


mpicc -O2 mpi_matrix_mul.c -o mpi_matrix_mul
mpirun -np 4 ./mpi_matrix_mul

mpirun --allow-run-as-root -np 4 ./mpi_matrix_mul (if issue)


################# Implement (a) Prefix Sum using MPI  (b) Matrix Multiplication using OpenMP embedded in MPI #################################

########### Prefix Sum ############

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

#define N 20

int main(int argc, char *argv[]) {
    int rank, size;
    int a[N], prefix[N];

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int base = N / size;
    int rem  = N % size;

    int local_n = base + (rank < rem ? 1 : 0);
    int start = rank * base + (rank < rem ? rank : rem);

    if (rank == 0) {
        for (int i = 0; i < N; i++) a[i] = i + 1;
        printf("Initial array: ");
        for (int i = 0; i < N; i++) printf("%d ", a[i]);
        printf("\n");
    }

    MPI_Bcast(a, N, MPI_INT, 0, MPI_COMM_WORLD);

    int *local_prefix = (int *)malloc(local_n * sizeof(int));
    int local_sum = 0;

    for (int i = 0; i < local_n; i++) {
        local_sum += a[start + i];
        local_prefix[i] = local_sum;
    }

    int offset = 0;
    MPI_Exscan(&local_sum, &offset, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    if (rank == 0) offset = 0;

    for (int i = 0; i < local_n; i++) {
        local_prefix[i] += offset;
    }

    int *recvcounts = NULL, *displs = NULL;
    if (rank == 0) {
        recvcounts = (int *)malloc(size * sizeof(int));
        displs = (int *)malloc(size * sizeof(int));
        for (int r = 0; r < size; r++) {
            recvcounts[r] = base + (r < rem ? 1 : 0);
            displs[r] = r * base + (r < rem ? r : rem);
        }
    }

    MPI_Gatherv(local_prefix, local_n, MPI_INT,
                prefix, recvcounts, displs, MPI_INT,
                0, MPI_COMM_WORLD);

    if (rank == 0) {
        printf("Prefix sum:    ");
        for (int i = 0; i < N; i++) printf("%d ", prefix[i]);
        printf("\n");
        free(recvcounts);
        free(displs);
    }

    free(local_prefix);
    MPI_Finalize();
    return 0;
}


mpicc prefix_sum.c -o prefix_sum
mpirun -np 4 ./prefix_sum


######### Matrix ########

#include <mpi.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

#define N 8

void print_matrix(int m[N][N], const char *name) {
    printf("%s:\n", name);
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) printf("%4d", m[i][j]);
        printf("\n");
    }
    printf("\n");
}

int main(int argc, char *argv[]) {
    int rank, size;
    int A[N][N], B[N][N], C[N][N];
    int localA[N][N], localC[N][N];
    int rows_per_rank;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    rows_per_rank = N / size;
    if (N % size != 0) {
        if (rank == 0) printf("N must be divisible by number of MPI processes\n");
        MPI_Finalize();
        return 0;
    }

    if (rank == 0) {
        srand(1);
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++) {
                A[i][j] = rand() % 10;
                B[i][j] = rand() % 10;
            }
    }

    MPI_Bcast(B, N * N, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Scatter(A, rows_per_rank * N, MPI_INT, localA, rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);

    for (int i = 0; i < rows_per_rank; i++) {
        for (int j = 0; j < N; j++) {
            localC[i][j] = 0;
            #pragma omp parallel for reduction(+:localC[i][j])
            for (int k = 0; k < N; k++) {
                localC[i][j] += localA[i][k] * B[k][j];
            }
        }
    }

    MPI_Gather(localC, rows_per_rank * N, MPI_INT, C, rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        print_matrix(A, "Matrix A");
        print_matrix(B, "Matrix B");
        print_matrix(C, "Matrix C = A x B");
    }

    MPI_Finalize();
    return 0;
}


mpicc -fopenmp hybrid_mm.c -o hybrid_mm
export OMP_NUM_THREADS=2
mpirun -np 4 ./hybrid_mm


################## 7. Write an OpenMP program such that, It should print the sum of square of the thread id’s. Also make sure that, each thread should print the square value of their thread id. ##############################

#include <stdio.h>
#include <omp.h>

int main() {
    int sum = 0;

    #pragma omp parallel reduction(+:sum)
    {
        int tid = omp_get_thread_num();
        int sq = tid * tid;

        #pragma omp critical
        printf("Thread %d square = %d\n", tid, sq);

        sum += sq;
    }

    printf("Sum of squares = %d\n", sum);
    return 0;
}


gcc -fopenmp omp_square.c -o omp_square
./omp_square




export OMP_NUM_THREADS=4
./omp_square


##################### 8. Implement Sieve of Eratosthenes with different data decomposition options using Parallel MPI programming ####################

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define N 100

void get_base_primes(int *base, int *count) {
    int limit = (int)sqrt(N);
    int temp[101];
    for (int i = 0; i <= limit; i++) temp[i] = 1;
    temp[0] = temp[1] = 0;

    for (int p = 2; p * p <= limit; p++) {
        if (temp[p]) {
            for (int i = p * p; i <= limit; i += p)
                temp[i] = 0;
        }
    }

    *count = 0;
    for (int i = 2; i <= limit; i++) {
        if (temp[i]) base[(*count)++] = i;
    }
}

void block_sieve(int rank, int size) {
    int start = 2 + rank * (N - 1) / size;
    int end = 1 + (rank + 1) * (N - 1) / size;

    int local_size = end - start + 1;
    int *marked = calloc(local_size, sizeof(int));

    int base[100], count;
    get_base_primes(base, &count);

    MPI_Barrier(MPI_COMM_WORLD);
    double t1 = MPI_Wtime();

    for (int i = 0; i < count; i++) {
        int p = base[i];
        int first = (start + p - 1) / p * p;
        if (first < p * p) first = p * p;

        for (int j = first; j <= end; j += p)
            marked[j - start] = 1;
    }

    MPI_Barrier(MPI_COMM_WORLD);
    double t2 = MPI_Wtime();

    if (rank == 0)
        printf("Block Time: %f\n", t2 - t1);

    free(marked);
}

void cyclic_sieve(int rank, int size) {
    int base[100], count;
    get_base_primes(base, &count);

    MPI_Barrier(MPI_COMM_WORLD);
    double t1 = MPI_Wtime();

    for (int num = 2 + rank; num <= N; num += size) {
        int composite = 0;
        for (int i = 0; i < count; i++) {
            int p = base[i];
            if (num == p) continue;
            if (num % p == 0) {
                composite = 1;
                break;
            }
        }
    }

    MPI_Barrier(MPI_COMM_WORLD);
    double t2 = MPI_Wtime();

    if (rank == 0)
        printf("Cyclic Time: %f\n", t2 - t1);
}

int main(int argc, char *argv[]) {
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    block_sieve(rank, size);
    cyclic_sieve(rank, size);

    MPI_Finalize();
    return 0;
}


mpicc sieve.c -o sieve -lm

mpirun -np 4 ./sieve


######################## 9. Implement Floyd’s version of All-Pair Shortest-Paths Problem through all steps of parallel algorithm design namely partitioning, communication, agglomeration and mapping using Parallel MPI programming ########################

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

#define N 8
#define INF 9999

static void print_matrix(int *a) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            if (a[i * N + j] >= INF) printf(" INF");
            else printf("%4d", a[i * N + j]);
        }
        printf("\n");
    }
}

static void block_info(int n, int p, int r, int *start, int *count) {
    int base = n / p, rem = n % p;
    *count = base + (r < rem);
    *start = r * base + (r < rem ? r : rem);
}

static int owner_of(int k, int n, int p) {
    int base = n / p, rem = n % p;
    if (k < (base + 1) * rem) return k / (base + 1);
    return rem + (k - (base + 1) * rem) / base;
}

int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int *counts = malloc(size * sizeof(int));
    int *displs = malloc(size * sizeof(int));

    for (int r = 0; r < size; r++) {
        int start, rows;
        block_info(N, size, r, &start, &rows);
        counts[r] = rows * N;
        displs[r] = start * N;
    }

    int start_row, local_rows;
    block_info(N, size, rank, &start_row, &local_rows);

    int *local = malloc(local_rows * N * sizeof(int));
    int *matrix = NULL;

    if (rank == 0) {
        matrix = malloc(N * N * sizeof(int));
        srand(1);
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) {
                if (i == j) matrix[i * N + j] = 0;
                else matrix[i * N + j] = (rand() % 5 == 0) ? INF : (rand() % 20 + 1);
            }
        }
        print_matrix(matrix);
        printf("\n");
    }

    MPI_Scatterv(matrix, counts, displs, MPI_INT,
                 local, local_rows * N, MPI_INT, 0, MPI_COMM_WORLD);

    int *rowk = malloc(N * sizeof(int));

    for (int k = 0; k < N; k++) {
        int owner = owner_of(k, N, size);

        if (rank == owner) {
            int lk = k - start_row;
            for (int j = 0; j < N; j++) rowk[j] = local[lk * N + j];
        }

        MPI_Bcast(rowk, N, MPI_INT, owner, MPI_COMM_WORLD);

        for (int i = 0; i < local_rows; i++) {
            if (local[i * N + k] >= INF) continue;
            for (int j = 0; j < N; j++) {
                if (rowk[j] >= INF) continue;
                int via = local[i * N + k] + rowk[j];
                if (via < local[i * N + j]) local[i * N + j] = via;
            }
        }
    }

    if (rank == 0) matrix = malloc(N * N * sizeof(int));

    MPI_Gatherv(local, local_rows * N, MPI_INT,
                matrix, counts, displs, MPI_INT, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        print_matrix(matrix);
    }

    free(rowk);
    free(local);
    free(counts);
    free(displs);
    if (rank == 0) free(matrix);

    MPI_Finalize();
    return 0;
}


mpicc -O2 -o floyd_mpi floyd_mpi.c
mpirun -np 4 ./floyd_mpi

mpirun --allow-run-as-root -np 4 ./floyd_mpi


#################### 10. Write a CUDA C program to find Matrix Transpose and Multiply with a Vector ################

// transpose_matvec.cu
#include <stdio.h>
#include <cuda_runtime.h>

#define N 4

__global__ void transposeKernel(const float *in, float *out, int n) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < n && col < n) {
        out[col * n + row] = in[row * n + col];
    }
}

__global__ void matVecKernel(const float *mat, const float *vec, float *res, int n) {
    int row = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < n) {
        float sum = 0.0f;
        for (int j = 0; j < n; j++) {
            sum += mat[row * n + j] * vec[j];
        }
        res[row] = sum;
    }
}

void printMatrix(const float *m, int n) {
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            printf("%6.1f ", m[i * n + j]);
        }
        printf("\n");
    }
}

void printVector(const float *v, int n) {
    for (int i = 0; i < n; i++) {
        printf("%6.1f ", v[i]);
    }
    printf("\n");
}

int main() {
    float h_mat[N * N] = {
        1,  2,  3,  4,
        5,  6,  7,  8,
        9, 10, 11, 12,
       13, 14, 15, 16
    };

    float h_vec[N] = {1, 2, 3, 4};
    float h_trans[N * N];
    float h_res[N];

    float *d_mat, *d_vec, *d_trans, *d_res;

    cudaMalloc((void**)&d_mat, N * N * sizeof(float));
    cudaMalloc((void**)&d_vec, N * sizeof(float));
    cudaMalloc((void**)&d_trans, N * N * sizeof(float));
    cudaMalloc((void**)&d_res, N * sizeof(float));

    cudaMemcpy(d_mat, h_mat, N * N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_vec, h_vec, N * sizeof(float), cudaMemcpyHostToDevice);

    dim3 block2D(16, 16);
    dim3 grid2D((N + block2D.x - 1) / block2D.x, (N + block2D.y - 1) / block2D.y);
    transposeKernel<<<grid2D, block2D>>>(d_mat, d_trans, N);

    int block1D = 256;
    int grid1D = (N + block1D - 1) / block1D;
    matVecKernel<<<grid1D, block1D>>>(d_mat, d_vec, d_res, N);

    cudaMemcpy(h_trans, d_trans, N * N * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(h_res, d_res, N * sizeof(float), cudaMemcpyDeviceToHost);

    printf("Matrix A:\n");
    printMatrix(h_mat, N);

    printf("\nTranspose of A:\n");
    printMatrix(h_trans, N);

    printf("\nVector x:\n");
    printVector(h_vec, N);

    printf("\nA * x:\n");
    printVector(h_res, N);

    cudaFree(d_mat);
    cudaFree(d_vec);
    cudaFree(d_trans);
    cudaFree(d_res);

    return 0;
}




%%writefile transpose_matvec.cu
// paste the full code here



!nvcc -arch=sm_75 transpose_matvec.cu -o transpose_matvec
!./transpose_matvec


##########################33 Implement N Body Problem using Fosters Design method #######################333

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <omp.h>

#define G 1.0
#define DT 0.001
#define SOFTENING 1e-9
#define COLLISION_DIST 0.01

static double rand_range(double a, double b) {
    return a + (b - a) * ((double)rand() / (double)RAND_MAX);
}

int main(void) {
    int n = 100;
    int steps = 10000;
    double collision_dist2 = COLLISION_DIST * COLLISION_DIST;
    long long collision_count = 0;

    srand((unsigned)time(NULL));

    double *x = malloc(n * sizeof(double));
    double *y = malloc(n * sizeof(double));
    double *z = malloc(n * sizeof(double));
    double *vx = malloc(n * sizeof(double));
    double *vy = malloc(n * sizeof(double));
    double *vz = malloc(n * sizeof(double));
    double *m  = malloc(n * sizeof(double));
    double *ax = malloc(n * sizeof(double));
    double *ay = malloc(n * sizeof(double));
    double *az = malloc(n * sizeof(double));

    if (!x || !y || !z || !vx || !vy || !vz || !m || !ax || !ay || !az) {
        printf("Memory allocation failed\n");
        return 1;
    }

    for (int i = 0; i < n; i++) {
        x[i] = rand_range(-1.0, 1.0);
        y[i] = rand_range(-1.0, 1.0);
        z[i] = rand_range(-1.0, 1.0);

        vx[i] = rand_range(-0.01, 0.01);
        vy[i] = rand_range(-0.01, 0.01);
        vz[i] = rand_range(-0.01, 0.01);

        m[i] = rand_range(0.5, 5.0);
    }

    for (int step = 0; step < steps; step++) {
        #pragma omp parallel for
        for (int i = 0; i < n; i++) {
            double a_x = 0.0, a_y = 0.0, a_z = 0.0;

            for (int j = 0; j < n; j++) {
                if (i == j) continue;

                double dx = x[j] - x[i];
                double dy = y[j] - y[i];
                double dz = z[j] - z[i];

                double dist2 = dx * dx + dy * dy + dz * dz + SOFTENING;
                double invDist = 1.0 / sqrt(dist2);
                double invDist3 = invDist * invDist * invDist;
                double f = G * m[j] * invDist3;

                a_x += dx * f;
                a_y += dy * f;
                a_z += dz * f;
            }

            ax[i] = a_x;
            ay[i] = a_y;
            az[i] = a_z;
        }

        #pragma omp parallel for
        for (int i = 0; i < n; i++) {
            vx[i] += ax[i] * DT;
            vy[i] += ay[i] * DT;
            vz[i] += az[i] * DT;

            x[i] += vx[i] * DT;
            y[i] += vy[i] * DT;
            z[i] += vz[i] * DT;
        }

        #pragma omp parallel for reduction(+:collision_count)
        for (int i = 0; i < n; i++) {
            for (int j = i + 1; j < n; j++) {
                double dx = x[j] - x[i];
                double dy = y[j] - y[i];
                double dz = z[j] - z[i];

                double dist2 = dx * dx + dy * dy + dz * dz;

                if (dist2 < collision_dist2) {
                    double cx = 0.5 * (x[i] + x[j]);
                    double cy = 0.5 * (y[i] + y[j]);
                    double cz = 0.5 * (z[i] + z[j]);

                    #pragma omp critical
                    {
                        printf("Collision: body %d and body %d at step %d, t=%.3f, coordinate=(%.6f, %.6f, %.6f)\n",
                               i, j, step, step * DT, cx, cy, cz);
                    }

                    collision_count++;
                }
            }
        }
    }

    printf("Bodies: %d\n", n);
    printf("Steps: %d\n", steps);
    printf("Total collisions: %lld\n", collision_count);

    free(x);
    free(y);
    free(z);
    free(vx);
    free(vy);
    free(vz);
    free(m);
    free(ax);
    free(ay);
    free(az);

    return 0;
}




gcc -fopenmp nbody.c -o nbody -lm
export OMP_NUM_THREADS=4
./nbody