Skip to content

Commit

Permalink
Merge pull request #206 from martin-g/support-linux-aarch64
Browse files Browse the repository at this point in the history
Update sse2neon.h to latest for better support for Linux ARM64
  • Loading branch information
msuchard authored Oct 13, 2023
2 parents ba16048 + 4b77564 commit 6d909c2
Show file tree
Hide file tree
Showing 5 changed files with 7,632 additions and 4,677 deletions.
59 changes: 59 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# CI workflow: builds the library on an x86_64 runner and inside an aarch64
# container (uraimo/run-on-arch-action), then checks the architecture reported
# by `file` for the resulting shared library.
name: CI

on:
  push:
    branches:
      - master
  pull_request:
    branches:
      - master

jobs:
  build:
    name: Build on Linux x86_64
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y cmake file

      - name: Build
        run: |
          mkdir build
          cd build
          cmake ..
          # A bare `make -j` places no limit on parallel jobs; bound it to the CPU count.
          make -j"$(nproc)"
          # grep exits non-zero (failing the step) unless the library is x86-64.
          file libhmsbeagle/libhmsbeagle.so.* | grep x86-64

  build-aarch64:
    name: Build on Linux aarch64
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Build
        uses: uraimo/run-on-arch-action@v2
        with:
          arch: aarch64
          distro: ubuntu20.04
          githubToken: ${{ github.token }}
          # Mount the checked-out repository into the container at /beagle-lib.
          dockerRunArgs: |
            --volume "${PWD}:/beagle-lib"
          install: |
            apt-get update -q -y
            apt-get install -q -y cmake gcc g++ openjdk-11-jdk file
          run: |
            export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-arm64
            cd /beagle-lib
            mkdir build
            cd build
            cmake ..
            # A bare `make -j` places no limit on parallel jobs; bound it to the CPU count.
            make -j"$(nproc)"
            # grep exits non-zero (failing the step) unless the library is aarch64.
            file libhmsbeagle/libhmsbeagle.so.* | grep aarch64
2 changes: 1 addition & 1 deletion libhmsbeagle/CPU/BeagleCPUPlugin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ BeagleCPUPlugin::BeagleCPUPlugin() :
Plugin("CPU", "CPU")
{
BeagleResource resource;
#ifdef __ARM64_ARCH_8__
#ifdef __aarch64__
resource.name = (char*) "CPU (arm64)";
#else
resource.name = (char*) "CPU (x86_64)";
Expand Down
8 changes: 5 additions & 3 deletions libhmsbeagle/CPU/BeagleCPUSSEPlugin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ BeagleCPUSSEPlugin::BeagleCPUSSEPlugin() :
Plugin("CPU-SSE", "CPU-SSE")
{
BeagleResource resource;
#ifdef __ARM64_ARCH_8__
#ifdef __aarch64__
resource.name = (char*) "CPU (arm64)";
#else
resource.name = (char*) "CPU (x86_64)";
Expand Down Expand Up @@ -116,6 +116,8 @@ bool check_sse2()
if (edx & bit_SSE2)
return true;
return false;
#elif defined(__aarch64__)
return false;
#else // HAVE_CPUID.H
// Determine if cpuid supported:
unsigned int res;
Expand Down Expand Up @@ -158,8 +160,8 @@ bool check_sse2()
return result[3] & 0x04000000;
#endif // HAVE_CPUID.H
}
#else // For Mac OS X GNU C
#if defined(__ARM64_ARCH_8__)
#else
#if defined(__aarch64__)
bool check_sse2() { return 1; }
#else
bool check_sse2(){
Expand Down
30 changes: 1 addition & 29 deletions libhmsbeagle/CPU/SSEDefinitions.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,40 +23,12 @@
#define DLS_USE_SSE2

#if defined(DLS_USE_SSE2)
# if defined(__ARM64_ARCH_8__)
# if defined(__aarch64__)
# include "libhmsbeagle/CPU/sse2neon.h"
# define _MM_SHUFFLE2(fp1,fp0) (((fp1) << 1) | (fp0))
# define VEC_SHUFFLE0(a,b) _mm_shuffle_pd(a, b, _MM_SHUFFLE2(0,0)) // vreinterpretq_f64_m128d(a)
# define VEC_SHUFFLE1(a,b) _mm_shuffle_pd(a, b, _MM_SHUFFLE2(1,1)) // vreinterpretq_f64_m128d(a)
// NEON stand-in for SSE2 _mm_shuffle_pd(a, b, imm), available only when the
// compiler provides __builtin_shufflevector (otherwise compilation stops with
// the #error below).
//
// SSE2 semantics: result lane 0 = a[bit 0 of imm], result lane 1 = b[bit 1 of imm].
// __builtin_shufflevector numbers lanes across BOTH inputs (0-1 address the
// first vector, 2-3 the second), so the second index is offset by 2 to pick
// from b; without the offset both lanes would be taken from a.
//
// NOTE(review): the result is wrapped with vreinterpretq_m128_f32 although
// _shuf is a float64x2_t -- confirm that is the intended reinterpret helper.
# if __has_builtin(__builtin_shufflevector)
# define _mm_shuffle_pd(a,b,imm) \
    __extension__({ \
        float64x2_t _input1 = vreinterpretq_f64_m128(a); \
        float64x2_t _input2 = vreinterpretq_f64_m128(b); \
        float64x2_t _shuf = __builtin_shufflevector( \
            _input1, _input2, (imm) & (0x1), (((imm) >> 1) & 0x1) + 2); \
        vreinterpretq_m128_f32(_shuf); \
    })
# else
# error "Need to implement NEON translation of _mm_shuffle_pd"
# endif

// NEON stand-in for _mm_div_pd: reinterprets both operands as float64x2
// vectors, divides a by b lane-wise with vdivq_f64, and reinterprets the
// quotient back to __m128.
// NOTE(review): the signature uses __m128 rather than __m128d -- confirm
// against the callers and the sse2neon type definitions.
static inline __m128 _mm_div_pd(__m128 a, __m128 b) {
    const auto numerator = vreinterpretq_f64_m128(a);
    const auto denominator = vreinterpretq_f64_m128(b);
    return vreinterpretq_m128_f64(vdivq_f64(numerator, denominator));
}

// NEON stand-in for _mm_store_sd: reinterprets b as a float64x2 vector and
// writes its lane 0 to the double pointed to by a.
static inline void _mm_store_sd(double* a, __m128 b) {
    const auto lanes = vreinterpretq_f64_m128(b);
    *a = lanes[0];
}

// NEON stand-in for _mm_dp_pd: multiplies lhs and rhs lane-wise, adds the two
// products, and returns a vector holding that sum in both lanes. The unnamed
// int parameter is ignored.
static inline __m128 _mm_dp_pd(__m128 lhs, __m128 rhs, const int) {
    const auto lanes = vmulq_f64(vreinterpretq_f64_m128d(lhs),
                                 vreinterpretq_f64_m128d(rhs));
    // TODO: there is almost surely a more efficient way to do this horizontal add.
    const auto dot = lanes[0] + lanes[1];
    return vreinterpretq_m128d_f64(vdupq_n_f64(dot));
}
# else
# if !defined(DLS_MACOS)
# include <emmintrin.h>
Expand Down
Loading

0 comments on commit 6d909c2

Please sign in to comment.