Skip to content

Commit

Permalink
Merge pull request #228 from chdb-io/replaceIcuWithUtf8proc
Browse files Browse the repository at this point in the history
Replace icu with utf8proc
  • Loading branch information
auxten authored Jun 22, 2024
2 parents 765d869 + 8ce6969 commit 50370a6
Show file tree
Hide file tree
Showing 23 changed files with 246 additions and 170 deletions.
10 changes: 5 additions & 5 deletions .github/workflows/build_arm_wheels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,13 @@ jobs:
- name: Restore submodules cache
run: |
cp -a /builder_cache/contrib ./
- name: remove old clang and link clang-17 to clang
- name: remove old clang and link clang-18 to clang
if: matrix.os == 'ubuntu-22.04'
run: |
sudo rm -f /usr/bin/clang || true
sudo ln -s /usr/bin/clang-17 /usr/bin/clang
sudo ln -s /usr/bin/clang-18 /usr/bin/clang
sudo rm -f /usr/bin/clang++ || true
sudo ln -s /usr/bin/clang++-17 /usr/bin/clang++
sudo ln -s /usr/bin/clang++-18 /usr/bin/clang++
which clang++
clang++ --version
- name: Make linux-arm64
Expand All @@ -65,11 +65,11 @@ jobs:
eval "$(pyenv init -)"
pyenv local "${{ matrix.python-version }}"
python3 -m pip install auditwheel
auditwheel -v repair -w dist/ --plat manylinux_2_17_aarch64 dist/*.whl
auditwheel -v repair -w dist/ --plat manylinux_2_18_aarch64 dist/*.whl
continue-on-error: false
- name: Show files
run: |
# e.g: remove chdb-0.11.4-cp310-cp310-linux_aarch64.whl, keep chdb-0.11.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
# e.g: remove chdb-0.11.4-cp310-cp310-linux_aarch64.whl, keep chdb-0.11.4-cp310-cp310-manylinux_2_18_aarch64.manylinux2014_aarch64.whl
sudo rm -f dist/*linux_aarch64.whl
ls -lh dist
shell: bash
Expand Down
14 changes: 7 additions & 7 deletions .github/workflows/build_wheels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,9 @@ jobs:
uname -a
wget https://apt.llvm.org/llvm.sh
chmod +x llvm.sh
sudo ./llvm.sh 17
which clang++-17
clang++-17 --version
sudo ./llvm.sh 18
which clang++-18
clang++-18 --version
sudo apt-get install -y make cmake ccache ninja-build yasm gawk wget
ccache -s
- name: Update git
Expand Down Expand Up @@ -85,13 +85,13 @@ jobs:
key: ${{ matrix.os }}
max-size: 5G
append-timestamp: true
- name: remove old clang and link clang-17 to clang
- name: remove old clang and link clang-18 to clang
if: matrix.os == 'ubuntu-20.04'
run: |
sudo rm -f /usr/bin/clang || true
sudo ln -s /usr/bin/clang-17 /usr/bin/clang
sudo ln -s /usr/bin/clang-18 /usr/bin/clang
sudo rm -f /usr/bin/clang++ || true
sudo ln -s /usr/bin/clang++-17 /usr/bin/clang++
sudo ln -s /usr/bin/clang++-18 /usr/bin/clang++
which clang++
clang++ --version
- name: Run chdb/build.sh
Expand Down Expand Up @@ -120,7 +120,7 @@ jobs:
make wheel
- name: Install patchelf from github
run: |
wget https://github.com/NixOS/patchelf/releases/download/0.17.2/patchelf-0.17.2-x86_64.tar.gz -O patchelf.tar.gz
wget https://github.com/NixOS/patchelf/releases/download/0.18.2/patchelf-0.18.2-x86_64.tar.gz -O patchelf.tar.gz
tar -xvf patchelf.tar.gz
sudo cp bin/patchelf /usr/bin/
sudo chmod +x /usr/bin/patchelf
Expand Down
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -372,3 +372,6 @@
[submodule "contrib/arrow"]
path = contrib/arrow
url = https://github.com/auxten/arrow
[submodule "contrib/utf8proc"]
path = contrib/utf8proc
url = https://github.com/JuliaStrings/utf8proc.git
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ endif()

if (ENABLE_PYTHON)
set(USE_PYTHON 1)
set(USE_UTF8PROC 1)
endif()

# Global libraries
Expand Down
68 changes: 48 additions & 20 deletions chdb/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ CMAKE_ARGS="-DCMAKE_BUILD_TYPE=${build_type} -DENABLE_THINLTO=0 -DENABLE_TESTS=0
-DENABLE_LIBRARIES=0 -DENABLE_RUST=0 \
${GLIBC_COMPATIBILITY} \
-DENABLE_UTILS=0 ${LLVM} ${UNWIND} \
${ICU} ${JEMALLOC} \
${ICU} -DENABLE_UTF8PROC=1 ${JEMALLOC} \
-DENABLE_PARQUET=1 -DENABLE_ROCKSDB=1 -DENABLE_SQLITE=1 -DENABLE_VECTORSCAN=1 \
-DENABLE_PROTOBUF=1 -DENABLE_THRIFT=1 \
-DENABLE_RAPIDJSON=1 \
Expand Down Expand Up @@ -161,12 +161,7 @@ LIBCHDB_SO="libchdb.so"
# Build libchdb.so
cmake ${CMAKE_ARGS} -DENABLE_PYTHON=0 ..
ninja -d keeprsp
if [ ! -f CMakeFiles/clickhouse.rsp ]; then
echo "CMakeFiles/clickhouse.rsp not found"
exit 1
fi

cp -a CMakeFiles/clickhouse.rsp CMakeFiles/libchdb.rsp

BINARY=${BUILD_DIR}/programs/clickhouse
echo -e "\nBINARY: ${BINARY}"
Expand All @@ -175,6 +170,18 @@ echo -e "\nldd ${BINARY}"
${LDD} ${BINARY}
rm -f ${BINARY}

cd ${BUILD_DIR}
ninja -d keeprsp -v > build.log || true
USING_RESPONSE_FILE=$(grep -m 1 'clang++.*-o programs/clickhouse .*' build.log | grep '@CMakeFiles/clickhouse.rsp' || true)

if [ ! "${USING_RESPONSE_FILE}" == "" ]; then
if [ -f CMakeFiles/clickhouse.rsp ]; then
cp -a CMakeFiles/clickhouse.rsp CMakeFiles/libchdb.rsp
else
echo "CMakeFiles/clickhouse.rsp not found"
exit 1
fi
fi

LIBCHDB_CMD=$(grep -m 1 'clang++.*-o programs/clickhouse .*' build.log \
| sed "s/-o programs\/clickhouse/-fPIC -shared -o ${LIBCHDB_SO}/" \
Expand All @@ -186,16 +193,21 @@ LIBCHDB_CMD=$(grep -m 1 'clang++.*-o programs/clickhouse .*' build.log \

# generate the command to generate libchdb.so
LIBCHDB_CMD=$(echo ${LIBCHDB_CMD} | sed 's/ '${CHDB_PY_MODULE}'/ '${LIBCHDB_SO}'/g')
${SED_INPLACE} 's/ '${CHDB_PY_MODULE}'/ '${LIBCHDB_SO}'/g' CMakeFiles/libchdb.rsp

if [ ! "${USING_RESPONSE_FILE}" == "" ]; then
${SED_INPLACE} 's/ '${CHDB_PY_MODULE}'/ '${LIBCHDB_SO}'/g' CMakeFiles/libchdb.rsp
fi

if [ "$(uname)" == "Linux" ]; then
LIBCHDB_CMD=$(echo ${LIBCHDB_CMD} | sed 's/ '${PYINIT_ENTRY}'/ /g')
${SED_INPLACE} 's/ '${PYINIT_ENTRY}'/ /g' CMakeFiles/libchdb.rsp
if [ ! "${USING_RESPONSE_FILE}" == "" ]; then
${SED_INPLACE} 's/ '${PYINIT_ENTRY}'/ /g' CMakeFiles/libchdb.rsp
fi
fi

if [ "$(uname)" == "Darwin" ]; then
LIBCHDB_CMD=$(echo ${LIBCHDB_CMD} | sed 's/ '${PYINIT_ENTRY}'/ -Wl,-exported_symbol,_query_stable -Wl,-exported_symbol,_free_result -Wl,-exported_symbol,_query_stable_v2 -Wl,-exported_symbol,_free_result_v2/g')
${SED_INPLACE} 's/ '${PYINIT_ENTRY}'/ -Wl,-exported_symbol,_query_stable -Wl,-exported_symbol,_free_result -Wl,-exported_symbol,_query_stable_v2 -Wl,-exported_symbol,_free_result_v2/g' CMakeFiles/libchdb.rsp
# ${SED_INPLACE} 's/ '${PYINIT_ENTRY}'/ -Wl,-exported_symbol,_query_stable -Wl,-exported_symbol,_free_result -Wl,-exported_symbol,_query_stable_v2 -Wl,-exported_symbol,_free_result_v2/g' CMakeFiles/libchdb.rsp
fi

LIBCHDB_CMD=$(echo ${LIBCHDB_CMD} | sed 's/@CMakeFiles\/clickhouse.rsp/@CMakeFiles\/libchdb.rsp/g')
Expand All @@ -220,12 +232,16 @@ ninja -d keeprsp || true
cd ${BUILD_DIR}
ninja -d keeprsp -v > build.log || true

if [ ! -f CMakeFiles/clickhouse.rsp ]; then
echo "CMakeFiles/clickhouse.rsp not found"
exit 1
fi
USING_RESPONSE_FILE=$(grep -m 1 'clang++.*-o programs/clickhouse .*' build.log | grep '@CMakeFiles/clickhouse.rsp' || true)

cp -a CMakeFiles/clickhouse.rsp CMakeFiles/pychdb.rsp
if [ ! "${USING_RESPONSE_FILE}" == "" ]; then
if [ -f CMakeFiles/clickhouse.rsp ]; then
cp -a CMakeFiles/clickhouse.rsp CMakeFiles/pychdb.rsp
else
echo "CMakeFiles/clickhouse.rsp not found"
exit 1
fi
fi

# extract the command to generate CHDB_PY_MODULE
PYCHDB_CMD=$(grep -m 1 'clang++.*-o programs/clickhouse .*' build.log \
Expand All @@ -237,19 +253,21 @@ PYCHDB_CMD=$(grep -m 1 'clang++.*-o programs/clickhouse .*' build.log \
)


# inplace modify the CMakeFiles/pychdb.rsp
${SED_INPLACE} 's/-o programs\/clickhouse/-fPIC -Wl,-undefined,dynamic_lookup -shared ${PYINIT_ENTRY} -o ${CHDB_PY_MODULE}/' CMakeFiles/pychdb.rsp
${SED_INPLACE} 's/ -Wl,-undefined,error/ -Wl,-undefined,dynamic_lookup/g' CMakeFiles/pychdb.rsp
${SED_INPLACE} 's/ -Xlinker --no-undefined//g' CMakeFiles/pychdb.rsp
# # inplace modify the CMakeFiles/pychdb.rsp
# ${SED_INPLACE} 's/-o programs\/clickhouse/-fPIC -Wl,-undefined,dynamic_lookup -shared ${PYINIT_ENTRY} -o ${CHDB_PY_MODULE}/' CMakeFiles/pychdb.rsp
# ${SED_INPLACE} 's/ -Wl,-undefined,error/ -Wl,-undefined,dynamic_lookup/g' CMakeFiles/pychdb.rsp
# ${SED_INPLACE} 's/ -Xlinker --no-undefined//g' CMakeFiles/pychdb.rsp


if [ "$(uname)" == "Linux" ]; then
# remove src/CMakeFiles/clickhouse_malloc.dir/Common/stubFree.c.o
PYCHDB_CMD=$(echo ${PYCHDB_CMD} | sed 's/ src\/CMakeFiles\/clickhouse_malloc.dir\/Common\/stubFree.c.o//g')
${SED_INPLACE} 's/ src\/CMakeFiles\/clickhouse_malloc.dir\/Common\/stubFree.c.o//g' CMakeFiles/pychdb.rsp
# put -Wl,-wrap,malloc ... after -DUSE_JEMALLOC=1
PYCHDB_CMD=$(echo ${PYCHDB_CMD} | sed 's/ -DUSE_JEMALLOC=1/ -DUSE_JEMALLOC=1 -Wl,-wrap,malloc -Wl,-wrap,valloc -Wl,-wrap,pvalloc -Wl,-wrap,calloc -Wl,-wrap,realloc -Wl,-wrap,memalign -Wl,-wrap,aligned_alloc -Wl,-wrap,posix_memalign -Wl,-wrap,free/g')
${SED_INPLACE} 's/ -DUSE_JEMALLOC=1/ -DUSE_JEMALLOC=1 -Wl,-wrap,malloc -Wl,-wrap,valloc -Wl,-wrap,pvalloc -Wl,-wrap,calloc -Wl,-wrap,realloc -Wl,-wrap,memalign -Wl,-wrap,aligned_alloc -Wl,-wrap,posix_memalign -Wl,-wrap,free/g' CMakeFiles/pychdb.rsp
if [ ! "${USING_RESPONSE_FILE}" == "" ]; then
${SED_INPLACE} 's/ src\/CMakeFiles\/clickhouse_malloc.dir\/Common\/stubFree.c.o//g' CMakeFiles/pychdb.rsp
${SED_INPLACE} 's/ -DUSE_JEMALLOC=1/ -DUSE_JEMALLOC=1 -Wl,-wrap,malloc -Wl,-wrap,valloc -Wl,-wrap,pvalloc -Wl,-wrap,calloc -Wl,-wrap,realloc -Wl,-wrap,memalign -Wl,-wrap,aligned_alloc -Wl,-wrap,posix_memalign -Wl,-wrap,free/g' CMakeFiles/pychdb.rsp
fi
fi

# save the command to a file for debug
Expand All @@ -264,6 +282,16 @@ LIBCHDB_DIR=${BUILD_DIR}/

PYCHDB=${LIBCHDB_DIR}/${CHDB_PY_MODULE}
LIBCHDB=${LIBCHDB_DIR}/${LIBCHDB_SO}

# Strip debug info and the .comment/.note sections from both build outputs,
# except in Debug builds where the symbols are kept.
# ${STRIP} is left unquoted on purpose so it may carry extra arguments.
if [ "${build_type}" == "Debug" ]; then
    echo -e "\nDebug build, skip strip"
else
    echo -e "\nStrip the binary:"
    ${STRIP} --strip-debug --remove-section=.comment --remove-section=.note "${PYCHDB}"
    ${STRIP} --strip-debug --remove-section=.comment --remove-section=.note "${LIBCHDB}"
fi

echo -e "\nPYCHDB: ${PYCHDB}"
ls -lh ${PYCHDB}
echo -e "\nLIBCHDB: ${LIBCHDB}"
Expand Down
24 changes: 24 additions & 0 deletions chdb/vars.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,30 @@ pushd ${PROJ_DIR}
CHDB_VERSION=$(python3 -c 'import setup; print(setup.get_latest_git_tag())')
popd

# Select the strip tool, unless the caller already set STRIP.
#
# Search order:
#   1. the highest-versioned llvm-strip* in /usr/bin, then /usr/local/bin,
#      then the Homebrew locations used on macOS;
#   2. whatever llvm-strip is on PATH;
#   3. plain strip on PATH.
# STRIP is left empty when nothing is found.
if [ -z "$STRIP" ]; then
    for strip_glob in \
        "/usr/bin/llvm-strip*" \
        "/usr/local/bin/llvm-strip*" \
        "/usr/local/Cellar/llvm/*/bin/llvm-strip*" \
        "/usr/local/opt/llvm/bin/llvm-strip*"
    do
        # ${strip_glob} is intentionally unquoted so the pattern expands here.
        # stderr is silenced because a non-matching pattern is expected, not an error.
        STRIP=$(ls -1 ${strip_glob} 2>/dev/null | sort -V | tail -n 1)
        if [ -n "$STRIP" ]; then
            break
        fi
    done
    # This file is sourced, so do not leak the loop variable into the caller.
    unset strip_glob
fi

# If no versioned llvm-strip was found, fall back to llvm-strip or strip on PATH.
# `|| true` keeps a missing tool from aborting a caller that runs with `set -e`.
if [ -z "$STRIP" ]; then
    STRIP=$(command -v llvm-strip || command -v strip || true)
fi

# check current os type, and make ldd command
if [ "$(uname)" == "Darwin" ]; then
LDD="otool -L"
Expand Down
1 change: 1 addition & 0 deletions contrib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ add_contrib (wyhash-cmake wyhash)
add_contrib (cityhash102)
add_contrib (libfarmhash)
add_contrib (icu-cmake icu)
add_contrib (utf8proc-cmake utf8proc)
add_contrib (h3-cmake h3)
add_contrib (mariadb-connector-c-cmake mariadb-connector-c)
add_contrib (libfiu-cmake libfiu)
Expand Down
2 changes: 1 addition & 1 deletion contrib/cassandra-cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
option(ENABLE_CASSANDRA "Enable Cassandra" ${ENABLE_LIBRARIES})
option(ENABLE_CASSANDRA "Enable Cassandra" 0)

if (NOT ENABLE_CASSANDRA)
message(STATUS "Not using cassandra")
Expand Down
1 change: 1 addition & 0 deletions contrib/utf8proc
Submodule utf8proc added at dce381
17 changes: 17 additions & 0 deletions contrib/utf8proc-cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Builds the bundled utf8proc library from contrib/utf8proc and exposes it as
# the ch_contrib::utf8proc target.
#
# ENABLE_UTF8PROC (default ON): when OFF, no target is created and this file
# returns early.
option(ENABLE_UTF8PROC "Enable UTF8PROC" 1)
if (NOT ENABLE_UTF8PROC)
    message(STATUS "Not using utf8proc")
    return()
endif()

set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/utf8proc/")
# Kept in the cache because src/CMakeLists.txt reads UTF8PROC_INCLUDE_DIR directly.
set(UTF8PROC_INCLUDE_DIR "${LIBRARY_DIR}" CACHE STRING "Path to utf8proc")
message(STATUS "Using utf8proc from ${LIBRARY_DIR}")

set(SRCS
    "${LIBRARY_DIR}/utf8proc.c"
)

add_library(utf8proc ${SRCS})
add_library(ch_contrib::utf8proc ALIAS utf8proc)

# PUBLIC so that every target linking ch_contrib::utf8proc gets the header
# directory as a usage requirement instead of depending on the cache variable.
# SYSTEM so that the third-party header does not trip the project's warning flags.
target_include_directories(utf8proc SYSTEM PUBLIC "${LIBRARY_DIR}")
17 changes: 3 additions & 14 deletions programs/local/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,20 +47,6 @@ if (USE_PYTHON)
endif()
endif()

# add_library(clickhouse-local-lib SHARED ${CLICKHOUSE_LOCAL_SOURCES})

# target_link_libraries(clickhouse-local-lib
# PRIVATE
# boost::program_options
# clickhouse_aggregate_functions
# clickhouse_common_config
# clickhouse_common_io
# clickhouse_functions
# clickhouse_parsers
# clickhouse_storages_system
# clickhouse_table_functions
# )

set (CLICKHOUSE_LOCAL_LINK
PRIVATE
boost::program_options
Expand All @@ -83,6 +69,9 @@ endif()
if (TARGET ch_contrib::azure_sdk)
target_link_libraries(clickhouse-local-lib PRIVATE ch_contrib::azure_sdk)
endif()
if (TARGET ch_contrib::utf8proc)
target_link_libraries(clickhouse-local-lib PRIVATE ch_contrib::utf8proc)
endif()

# Always use internal readpassphrase
target_link_libraries(clickhouse-local-lib PRIVATE readpassphrase)
6 changes: 6 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -640,6 +640,12 @@ if (USE_ORC)
dbms_target_include_directories(SYSTEM BEFORE PUBLIC ${ORC_INCLUDE_DIR} "${PROJECT_BINARY_DIR}/contrib/orc/c++/include")
endif ()

# utf8proc wiring: hand ch_contrib::utf8proc to dbms_target_link_libraries and
# add UTF8PROC_INCLUDE_DIR to clickhouse_common_io's public system include
# path (placed BEFORE the existing entries), then log the directory used.
# NOTE(review): this is gated on the USE_UTF8PROC variable, not on the
# existence of the ch_contrib::utf8proc target - confirm the two cannot
# disagree (e.g. USE_UTF8PROC set while ENABLE_UTF8PROC is off).
if (USE_UTF8PROC)
dbms_target_link_libraries(PUBLIC ch_contrib::utf8proc)
target_include_directories (clickhouse_common_io SYSTEM BEFORE PUBLIC ${UTF8PROC_INCLUDE_DIR})
message(STATUS "UTF8PROC_INCLUDE_DIR: ${UTF8PROC_INCLUDE_DIR}")
endif()

if (TARGET ch_contrib::rocksdb)
dbms_target_link_libraries(PUBLIC ch_contrib::rocksdb)
endif()
Expand Down
Loading

0 comments on commit 50370a6

Please sign in to comment.