diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 0000000..af04286 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1 @@ +github: FrancescAlted diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d7b0afc..8e88f52 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -5,37 +5,51 @@ on: [push, pull_request] permissions: contents: read -env: - CIBW_BEFORE_BUILD: pip install setuptools oldest-supported-numpy - CIBW_BUILD_VERBOSITY: 1 - CIBW_TEST_COMMAND: python -c "import sys, numexpr; sys.exit(0 if numexpr.test().wasSuccessful() else 1)" - CIBW_TEST_SKIP: "*macosx*arm64*" - # Building for musllinux and aarch64 takes way too much time. - # NumPy is adding musllinux for just x86_64 too, so this is not too bad. - CIBW_SKIP: "*musllinux*aarch64*" - jobs: build_wheels: - name: Build wheels on ${{ matrix.os }} for ${{ matrix.arch }} - ${{ matrix.p_ver }} - runs-on: ${{ matrix.os }} + name: Build and test on ${{ matrix.os }}${{ matrix.numpy-version && format(' (numpy {0})', matrix.numpy-version) || '' }} for ${{ matrix.arch }} + runs-on: ${{ matrix.runs-on || matrix.os }} permissions: contents: write env: - CIBW_BUILD: ${{ matrix.cibw_build }} CIBW_ARCHS_LINUX: ${{ matrix.arch }} CIBW_ARCHS_MACOS: "x86_64 arm64" + CIBW_ENABLE: cpython-freethreading + strategy: + fail-fast: false matrix: - os: [ubuntu-latest, windows-latest, macos-latest] - arch: [x86_64, aarch64] - cibw_build: ["cp3{9,10,11,12,13}-*"] - p_ver: ["3.9-3.13"] - exclude: - - os: windows-latest + include: + # Linux x86_64 (build wheels) + - os: ubuntu-latest + arch: x86_64 + artifact_name: "linux-x86_64" + python-version: "3.x" + + # Linux x86_64 (test numpy 1.26) + - os: ubuntu-latest + arch: x86_64 + artifact_name: "linux-x86_64_numpy1_26" + python-version: "3.12" + numpy-version: "1.26" + + # Linux ARM64 (build wheels) + - os: ubuntu-24.04-arm arch: aarch64 - # cibuild is already in charge to build aarch64 (see CIBW_ARCHS_MACOS) + 
artifact_name: "linux-aarch64" + python-version: "3.x" + + # Windows (build wheels) + - os: windows-latest + arch: x86_64 + artifact_name: "windows-x86_64" + python-version: "3.x" + + # macOS (build wheels) - os: macos-latest - arch: aarch64 + arch: x86_64 + artifact_name: "macos-universal2" + python-version: "3.x" steps: - uses: actions/checkout@v3 @@ -43,32 +57,38 @@ jobs: - uses: actions/setup-python@v3 name: Install Python with: - python-version: '3.x' + python-version: ${{ matrix.python-version }} - - name: Install cibuildwheel + # Run tests with specific numpy version + - name: Install and test with specific numpy version + if: matrix.numpy-version run: | - python -m pip install cibuildwheel - - - uses: docker/setup-qemu-action@v2 - if: ${{ matrix.arch == 'aarch64' }} - name: Set up QEMU + pip install "numpy==${{ matrix.numpy-version }}.*" + pip install -e . + pip install pytest + python -m pytest + # Build wheels only if: + # - No numpy version is specified + # - Python version is "3.x" - name: Build wheels - run: | - python -m cibuildwheel --output-dir wheelhouse + if: ${{ !matrix.numpy-version }} + uses: pypa/cibuildwheel@v3.1.3 - name: Make sdist - if: ${{ matrix.os == 'windows-latest' }} + if: ${{ matrix.os == 'windows-latest' && !matrix.numpy-version }} run: | python -m pip install build python -m build --sdist --outdir wheelhouse . 
- - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 + if: ${{ !matrix.numpy-version }} with: + name: ${{ matrix.artifact_name }} path: ./wheelhouse/* - name: Upload to GitHub Release + if: startsWith(github.ref, 'refs/tags/') && !matrix.numpy-version uses: softprops/action-gh-release@v1 - if: startsWith(github.ref, 'refs/tags/') with: files: wheelhouse/* diff --git a/.gitignore b/.gitignore index 928bf15..7bf6f98 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ artifact/ numexpr.egg-info/ *.pyc *.swp +*.so *~ doc/_build site.cfg diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..8b00c7e --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,26 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: debug-statements + +# Too many things to fix, let's just ignore it for now +#- repo: https://github.com/pycqa/flake8 +# rev: 7.0.0 +# hooks: +# - id: flake8 +# +- repo: https://github.com/pycqa/isort + rev: 7.0.0 + hooks: + - id: isort + +# Too many things to fix, let's just ignore it for now +#- repo: https://github.com/pre-commit/mirrors-mypy +# rev: v1.8.0 +# hooks: +# - id: mypy +# exclude: ^(docs/|setup.py) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index d2c3d13..3803a41 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -19,4 +19,4 @@ sphinx: # https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html python: install: - - requirements: doc/requirements.txt \ No newline at end of file + - requirements: doc/requirements.txt diff --git a/ADDFUNCS.rst b/ADDFUNCS.rst new file mode 100644 index 0000000..497c6fe --- /dev/null +++ b/ADDFUNCS.rst @@ -0,0 +1,241 @@ +Functions and Function signatures +================================= + +Adding functions +---------------- + +In order to add new functions to ``numexpr``, currently it is necessary to edit 
several files. Consider adding a function +``out_type myfunc(arg_type)``. + +* ``numexpr/expressions.py`` +Add ``'myfunc': func(numpy.myfunc, out_dtype),`` to the dict of functions, ``functions = {...``. If the return type of the function is ``bool``, add +the function to the list ``if opcode in ("isnan", "isfinite"):`` in the ``__init__`` function of the ``FuncNode`` class. +In the future it might be nice to refactor this function since it sets the output type based on the type of the inputs in general. + +* ``numexpr/necompiler.py`` +Add ``"myfunc"`` to the list of functions: + +.. code-block:: python3 + + "floor", + "isnan", + "isfinite", + "myfunc" + ] + +* ``numexpr/functions.hpp`` +Find the correct function signature ``FUNC_OA`` where ``O`` is the return type, and ``A`` the argument type(s). For example, if the function +is ``double myfunc(double)``, one should edit within the ``FUNC_DD`` clause. If you cannot find your function signature you will have to add it, +following the template of the other functions. +Most likely, you will want to add support for several function signatures (e.g. double -> bool and float -> bool) and so you will have to add the +function in two clauses. If your function has a float input, you will see that there are 5 arguments in the +``FUNC_OA`` macro, and you will have to add ``myfunc2`` here in order to compile on MSVC machines (i.e. Windows, see following). +Example: + +.. code-block:: cpp + :emphasize-lines: 6, 20 + + #ifndef FUNC_DD + #define ELIDE_FUNC_DD + #define FUNC_DD(...) + #endif + ... + FUNC_DD(FUNC_MYFUNC_DD, "myfunc_dd", myfunc, vdMyfunc) + FUNC_DD(FUNC_DD_LAST, NULL, NULL, NULL) + #ifdef ELIDE_FUNC_DD + #undef ELIDE_FUNC_DD + #undef FUNC_DD + #endif + + ... + + #ifndef FUNC_FF + #define ELIDE_FUNC_FF + #define FUNC_FF(...) + #endif + ...
+ FUNC_FF(FUNC_MYFUNC_FF, "myfunc_ff", myfuncf, myfuncf2, vsMyfunc) + FUNC_FF(FUNC_FF_LAST, NULL, NULL, NULL, NULL) + #ifdef ELIDE_FUNC_FF + #undef ELIDE_FUNC_FF + #undef FUNC_FF + #endif + +* ``numexpr/msvc_function_stubs.hpp`` +In order to support float arguments, due to oddities of MSVC, you have to provide explicit support for your function in this file. +Add ``#define myfuncf(x) ((float)floor((double)(x)))`` (if your function is float -> float) to the ``#if`` clause at the top of the file +which is for old versions of MSVC which did not have support for single precision functions. Then in the body, add an inline function + +.. code-block:: cpp + + inline float myfuncf2(float x) { + return myfuncf(x); + } + +This is the function that appears as the ``f_win32`` parameter in ``functions.hpp``. + +* ``numexpr/tests/test_numexpr.py`` +Don't forget to add a test for your function! + +Adding function signatures +-------------------------- +It may so happen that you cannot find your desired function signature in ``functions.hpp``. This means you will have to add it yourself! +This involves editing a few more files. In addition, there may be certain bespoke changes, specific to the function signature +that you may have to make (see Notes, below) + +* ``numexpr/functions.hpp`` +Firstly, add clause(s) for your function signature. For example, if the function signature is ``bool(double)`` and ``bool(float)``, add +``FUNC_BD`` and ``FUNC_BF`` clauses (in the latter case you will need the macro to take 5 arguments for MSVC-compatibility.) + +.. code-block:: cpp + + #ifndef FUNC_BD + #define ELIDE_FUNC_BD + #define FUNC_BD(...) + #endif + ... + FUNC_BD(FUNC_BD_LAST, NULL, NULL, NULL) + #ifdef ELIDE_FUNC_BD + #undef ELIDE_FUNC_BD + #undef FUNC_BD + #endif + + #ifndef FUNC_BF + #define ELIDE_FUNC_BF + #define FUNC_BF(...) + #endif + ...
+ FUNC_BF(FUNC_BF_LAST, NULL, NULL, NULL, NULL) + #ifdef ELIDE_FUNC_BF + #undef ELIDE_FUNC_BF + #undef FUNC_BF + #endif + +The ultimate source of the functions in the macro ``FUNC_BF(...)`` are the headers included in ``numexpr/interpreter.cpp`` (in particular +``numexpr/numexpr_config.hpp``, which can be used to overwrite ```` functions), so the functions should be available from there. + +* ``numexpr/interp_body.cpp`` +Add case support for OPCODES associated to your new function signatures via e.g. ``case OP_FUNC_BFN`` and ``case OP_FUNC_BDN``, following +the framework suggested by the other functions: + +.. code-block:: cpp + + case OP_FUNC_BFN: + #ifdef USE_VML + VEC_ARG1_VML(functions_bf_vml[arg2](BLOCK_SIZE, + (float*)x1, (bool*)dest)); + #else + VEC_ARG1(b_dest = functions_bf[arg2](f1)); + #endif + +Note that it is important that the out variable matches the output type of the function (i.e. ``b_dest`` for bool, ``f_dest`` for float etc.) + +* ``numexpr/interpreter.hpp`` +Add clauses to read the ``functions.hpp`` macros correctly + +.. code-block:: cpp + + enum FuncBFCodes { + #define FUNC_BF(fop, ...) fop, + #include "functions.hpp" + #undef FUNC_BF + }; + +* ``numexpr/interpreter.cpp`` +Add clauses to generate the FUNC_CODES from the ``functions.hpp`` header, making sure to include clauses for ``_WIN32`` and +``VML`` as necessary according to the framework suggested by the other functions. + +.. code-block:: cpp + + typedef bool (*FuncBFPtr)(float); + #ifdef _WIN32 + FuncBFPtr functions_bf[] = { + #define FUNC_BF(fop, s, f, f_win32, ...) f_win32, + #include "functions.hpp" + #undef FUNC_BF + }; + #else + FuncBFPtr functions_bf[] = { + #define FUNC_BF(fop, s, f, ...) 
f, + #include "functions.hpp" + #undef FUNC_BF + }; + #endif + + #ifdef USE_VML + typedef void (*FuncBFPtr_vml)(MKL_INT, const float*, bool*); + FuncBFPtr_vml functions_bf_vml[] = { + #define FUNC_BF(fop, s, f, f_win32, f_vml) f_vml, + #include "functions.hpp" + #undef FUNC_BF + }; + #endif + +Some functions (e.g. ``fmod``, ``isnan``) are not available in MKL, and so must be hard-coded in ``bespoke_functions.hpp`` as well: + +.. code-block:: cpp + + #ifdef USE_VML + /* no isnan, isfinite or isinf in VML */ + static void vdIsfinite(MKL_INT n, const double* x1, bool* dest) + { + MKL_INT j; + for (j=0; j= FUNC_BD_LAST) { + PyErr_Format(PyExc_RuntimeError, "invalid program: funccode out of range (%i) at %i", arg, argloc); + return -1; + } + } + else if (op == OP_FUNC_BFN) { + if (arg < 0 || arg >= FUNC_BF_LAST) { + PyErr_Format(PyExc_RuntimeError, "invalid program: funccode out of range (%i) at %i", arg, argloc); + return -1; + } + } + +* ``numexpr/module.cpp`` +Add code here to define the ``FUNC_OA`` macros you require + +.. code-block:: cpp + + #define FUNC_BF(name, sname, ...) add_func(name, sname); + #define FUNC_BD(name, sname, ...) add_func(name, sname); + ... + #include "functions.hpp" + ... + #undef FUNC_BD + #undef FUNC_BF + +* ``numexpr/opcodes.hpp`` +Finally, add the ``OP_FUNC_BDN`` etc. codes here. It is necessary for the OPCODES in the file to be in (ascending order) with +``NOOP`` as 0 and ``OP_LAST`` as the largest number. Secondly, all reduction OPCODES must appear last. Hence, after adding your +function signatures (just before the reduction OPCODES) it is necessary to increment all succeeding OPCODES. + +.. code-block:: cpp + + OPCODE(106, OP_FUNC_BDN, "func_bdn", Tb, Td, Tn, T0) + OPCODE(107, OP_FUNC_BFN, "func_bfn", Tb, Tf, Tn, T0) + +Notes +----- +In many cases this process will not be very smooth since one relies on the internal C/C++ standard functions (which can be fussy, to varying degrees on different platforms). 
Some common gotchas are then: + +* OPCODES are currently only supported up to 255 - if it becomes necessary to increment further, one will have to change the ``latin_1`` encoding used in ``quadrupleToString`` in ``necompiler.py``. In addition, since the OPCODE table is assumed to be of type ``unsigned char`` the ``get_return_sig`` function in ``numexpr/interpreter.cpp`` may have to be changed (possibly other changes too). + +* Depending on the new function signature (above all if the out type is different to the input types), one may have to edit the ``__init__`` function in the ``FuncNode`` class in ``expressions.py``. + +* Functions which accept and/or return complex arguments must be added to the ``complex_functions.hpp`` file (take care when adding them in ``interpreter.cpp`` and ``interp_body.cpp``, since their signatures are usually a bit different). + +* Depending on MSVC support, namespace clashes, casting problems, it may be necessary to make various changes to ``numexpr/numexpr_config.hpp`` and ``numexpr/msvc_function_stubs.hpp``. For example, in PR #523, non-clashing wrappers were introduced for ``isnan`` and ``isfinite`` since the float versions ``isnanf, isfinitef`` were inconsistently defined (and output ints) - depending on how strict the platform interpreter is, the implicit cast from int to bool was acceptable or not for example. In addition, the base functions were in different namespaces or had different names across platforms. diff --git a/ANNOUNCE.rst b/ANNOUNCE.rst index 4725189..c3fda2c 100644 --- a/ANNOUNCE.rst +++ b/ANNOUNCE.rst @@ -1,34 +1,21 @@ ========================= -Announcing NumExpr 2.10.1 +Announcing NumExpr 2.14.1 ========================= Hi everyone, -NumExpr 2.10.1 continues to stabilize the support for NumPy 2.0.0. -Also, the default number of 'safe' threads has been upgraded to 16 -(instead of previous 8). Finally, preliminary support for Python 3.13; -thanks to Karolina Surma. 
+NumExpr 2.14.1 introduces patches to ensure compatibility with NumPy 1.26, +rolling back static typing support. Project documentation is available at: -http://numexpr.readthedocs.io/ +https://numexpr.readthedocs.io/ -Changes from 2.10.0 to 2.10.1 +Changes from 2.14.0 to 2.14.1 ----------------------------- -* The default number of 'safe' threads has been upgraded to 16 (instead of - previous 8). That means that if your CPU has > 16 cores, the default is - to use 16. You can always override this with the "NUMEXPR_MAX_THREADS" - environment variable. - -* NumPy 1.23 is now the minimum supported. - -* Preliminary support for Python 3.13. Thanks to Karolina Surma. - -* Fix tests on nthreads detection (closes: #479). Thanks to @avalentino. - -* The build process has been modernized and now uses the `pyproject.toml` - file for more of the configuration options. +* Rolled back static typing support to ensure compatibility with NumPy 1.26. +* Added CI tests for NumPy 1.26 What's Numexpr? --------------- diff --git a/AUTHORS.txt b/AUTHORS.txt index 88b9047..28d978c 100644 --- a/AUTHORS.txt +++ b/AUTHORS.txt @@ -23,9 +23,12 @@ Google Inc. contributed bug fixes. David Cox improved readability of the Readme. -Robert A. McLeod contributed bug fixes and ported the documentation to +Robert A. McLeod contributed bug fixes and ported the documentation to numexpr.readthedocs.io. He has served as the maintainer of the package since 2016 to 2023. Teng Liu fixed many bugs, and in particular, contributed valuable fixes to the new regex sanitizer for expressions. + +Luke Shaw contributed a bunch of new functions, and expanded the amount +of opcodes from 128 to 256. diff --git a/README.rst b/README.rst index 9033d51..98069ce 100644 --- a/README.rst +++ b/README.rst @@ -106,7 +106,7 @@ See `requirements.txt` for the required version of NumPy. NumExpr is built in the standard Python way:: - python setup.py build install + pip install [-e] . 
You can test `numexpr` with:: @@ -159,6 +159,24 @@ Usage array([ True, False, False], dtype=bool) +Free-threading support +---------------------- +Starting on CPython 3.13 onwards there is a new distribution that disables the +Global Interpreter Lock (GIL) altogether, thus increasing the performance yields +under multi-threaded conditions on a single interpreter, as opposed to having to use +multiprocessing. + +Whilst numexpr has been demonstrated to work under free-threaded +CPython, considerations need to be taken when using numexpr native parallel +implementation vs using Python threads directly in order to prevent oversubscription, +we recommend either using the main CPython interpreter thread to spawn multiple C threads +using the parallel numexpr API, or spawning multiple CPython threads that do not use +the parallel API. + +For more information about free-threaded CPython, we recommend visiting the following +`community Wiki ` + + Documentation ------------- diff --git a/RELEASE_NOTES.rst b/RELEASE_NOTES.rst index c51964f..43ca0d1 100644 --- a/RELEASE_NOTES.rst +++ b/RELEASE_NOTES.rst @@ -1,12 +1,106 @@ ===================================== -Release notes for NumExpr 2.10 series +Release notes for NumExpr 2.14 series ===================================== -Changes from 2.10.1 to 2.10.2 +Changes from 2.14.1 to 2.14.2 ----------------------------- * **Under development.** +Changes from 2.14.0 to 2.14.1 +----------------------------- + +* Rolled back static typing support to ensure compatibility with NumPy 1.26. +* Added CI tests for NumPy 1.26 + + +Changes from 2.13.1 to 2.14.0 +----------------------------- + +* Numerical stability for overflow has been improved for ``tan`` / ``tanh`` + to handle possible overflows for complex numbers. + +* Static typing support has been added, making NumExpr compatible with + static type checkers like `mypy` and `pyright`. + Thanks to Joren Hammudoglu (@jorenham) for the work. 
+ + +Changes from 2.13.0 to 2.13.1 +----------------------------- + +* Patch to maximum/minimum functions in order to match NumPy NaN handling +* Patch to convert '+'->'|' and '*'->'&' for booleans + +Changes from 2.12.1 to 2.13.0 +----------------------------- + +* New functionality has been added: + * Bitwise operators (and, or, not, xor): `&, |, ~, ^` + * New binary arithmetic operator for floor division: `//` + * New functions: `signbit`, `hypot`, `copysign`, `nextafter`, `maximum`, `minimum`, `log2`, `trunc`, `round` and `sign`. + * Also enables integer outputs for integer inputs for `abs`, `copy`, `ones_like`, `sign` and `round`. + + Thanks to Luke Shaw for the contributions. + +* New wheels for Python 3.14 and 3.14t are provided. + + +Changes from 2.12.0 to 2.12.1 +----------------------------- + +* Added complex counterparts for isnan/isfinite/isinf functions. + Thanks to Luke Shaw. + +* Updated documentation for the new functions and instructions + for adding new functions to the virtual machine. Thanks to Luke Shaw. + +* Fixed MKL support; it was broken in 2.12.0. Thanks to + Christoph Gohlke for reporting the issue. + + +Changes from 2.11.0 to 2.12.0 +----------------------------- + +* Added isnan/isfinite/isinf functions. Thanks to Luke Shaw. + +* New instructions for adding new functions to the virtual machine. + They are available at ADDFUNCS.rst. Thanks to Luke Shaw. + +* We are distributing binary wheels for Python 3.14 and 3.14t now. + +* We are distributing musllinux wheels too! Thanks to Clément Robert. + + +Changes from 2.10.2 to 2.11.0 +----------------------------- + +* Initial support for free-threaded Python 3.13t has been added. + This is still experimental, so please report any issues you find. + For more info, see discussions PRs #504, #505 and #508. + Thanks to @andfoy, @rgommers and @FrancescAlted for the work. + +* Fix imaginary evaluation in the form of `1.1e1j`. This was + previously not supported and would raise an error. 
Thanks to @27rabbitlt + for the fix. + +* The test suite has been modernized to use `pytest` instead of `unittest`. + This should make it easier to run the tests and contribute to the project. + +* Python 3.10 is now the minimum supported version. + + +Changes from 2.10.1 to 2.10.2 +----------------------------- + +* Better support for CPUs that do not have a power of 2 number of + cores. See #479 and #490. Thanks to @avalentino. + +* Allow numexpr to run with the multithreading package in Python. + See PR #496. Thanks to @emmaai + +* Wheels for Python 3.13 are now provided. + + Changes from 2.10.0 to 2.10.1 ----------------------------- diff --git a/VERSION b/VERSION index 05a16b0..3047337 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.10.2.dev0 +2.14.2.dev0 diff --git a/bench/boolean_timing.py b/bench/boolean_timing.py index fe07b31..0be0bf7 100644 --- a/bench/boolean_timing.py +++ b/bench/boolean_timing.py @@ -9,8 +9,10 @@ #################################################################### from __future__ import print_function + import sys import timeit + import numpy array_size = 5_000_000 diff --git a/bench/free_threading.py b/bench/free_threading.py new file mode 100644 index 0000000..cd00e78 --- /dev/null +++ b/bench/free_threading.py @@ -0,0 +1,171 @@ +################################################################################# +# To compare the performance of numexpr when free-threading CPython is used. +# +# This example makes use of Python threads, as opposed to C native ones +# in order to highlight the improvement introduced by free-threading CPython, +# which now disables the GIL altogether. 
+################################################################################# +""" +Results with GIL-enabled CPython: + +Benchmarking Expression 1: +NumPy time (threaded over 32 chunks with 16 threads): 1.173090 seconds +numexpr time (threaded with re_evaluate over 32 chunks with 16 threads): 0.951071 seconds +numexpr speedup: 1.23x +---------------------------------------- +Benchmarking Expression 2: +NumPy time (threaded over 32 chunks with 16 threads): 10.410874 seconds +numexpr time (threaded with re_evaluate over 32 chunks with 16 threads): 8.248753 seconds +numexpr speedup: 1.26x +---------------------------------------- +Benchmarking Expression 3: +NumPy time (threaded over 32 chunks with 16 threads): 9.605909 seconds +numexpr time (threaded with re_evaluate over 32 chunks with 16 threads): 11.087108 seconds +numexpr speedup: 0.87x +---------------------------------------- +Benchmarking Expression 4: +NumPy time (threaded over 32 chunks with 16 threads): 3.836962 seconds +numexpr time (threaded with re_evaluate over 32 chunks with 16 threads): 18.054531 seconds +numexpr speedup: 0.21x +---------------------------------------- + +Results with free-threading CPython: + +Benchmarking Expression 1: +NumPy time (threaded over 32 chunks with 16 threads): 3.415349 seconds +numexpr time (threaded with re_evaluate over 32 chunks with 16 threads): 2.618876 seconds +numexpr speedup: 1.30x +---------------------------------------- +Benchmarking Expression 2: +NumPy time (threaded over 32 chunks with 16 threads): 19.005238 seconds +numexpr time (threaded with re_evaluate over 32 chunks with 16 threads): 12.611407 seconds +numexpr speedup: 1.51x +---------------------------------------- +Benchmarking Expression 3: +NumPy time (threaded over 32 chunks with 16 threads): 20.555149 seconds +numexpr time (threaded with re_evaluate over 32 chunks with 16 threads): 17.690749 seconds +numexpr speedup: 1.16x +---------------------------------------- +Benchmarking Expression 
4: +NumPy time (threaded over 32 chunks with 16 threads): 38.338372 seconds +numexpr time (threaded with re_evaluate over 32 chunks with 16 threads): 35.074684 seconds +numexpr speedup: 1.09x +---------------------------------------- +""" + +import os + +os.environ["NUMEXPR_NUM_THREADS"] = "2" +import threading +import timeit + +import numpy as np + +import numexpr as ne + +array_size = 10**8 +num_runs = 10 +num_chunks = 32 # Number of chunks +num_threads = 16 # Number of threads constrained by how many chunks memory can hold + +a = np.random.rand(array_size).reshape(10**4, -1) +b = np.random.rand(array_size).reshape(10**4, -1) +c = np.random.rand(array_size).reshape(10**4, -1) + +chunk_size = array_size // num_chunks + +expressions_numpy = [ + lambda a, b, c: a + b * c, + lambda a, b, c: a**2 + b**2 - 2 * a * b * np.cos(c), + lambda a, b, c: np.sin(a) + np.log(b) * np.sqrt(c), + lambda a, b, c: np.exp(a) + np.tan(b) - np.sinh(c), +] + +expressions_numexpr = [ + "a + b * c", + "a**2 + b**2 - 2 * a * b * cos(c)", + "sin(a) + log(b) * sqrt(c)", + "exp(a) + tan(b) - sinh(c)", +] + + +def benchmark_numpy_chunk(func, a, b, c, results, indices): + for index in indices: + start = index * chunk_size + end = (index + 1) * chunk_size + time_taken = timeit.timeit( + lambda: func(a[start:end], b[start:end], c[start:end]), number=num_runs + ) + results.append(time_taken) + + +def benchmark_numexpr_re_evaluate(expr, a, b, c, results, indices): + for index in indices: + start = index * chunk_size + end = (index + 1) * chunk_size + # if index == 0: + # Evaluate the first chunk with evaluate + time_taken = timeit.timeit( + lambda: ne.evaluate( + expr, + local_dict={ + "a": a[start:end], + "b": b[start:end], + "c": c[start:end], + }, + ), + number=num_runs, + ) + results.append(time_taken) + + +def run_benchmark_threaded(): + chunk_indices = list(range(num_chunks)) + + for i in range(len(expressions_numpy)): + print(f"Benchmarking Expression {i+1}:") + + results_numpy = [] + 
results_numexpr = [] + + threads_numpy = [] + for j in range(num_threads): + indices = chunk_indices[j::num_threads] # Distribute chunks across threads + thread = threading.Thread( + target=benchmark_numpy_chunk, + args=(expressions_numpy[i], a, b, c, results_numpy, indices), + ) + threads_numpy.append(thread) + thread.start() + + for thread in threads_numpy: + thread.join() + + numpy_time = sum(results_numpy) + print( + f"NumPy time (threaded over {num_chunks} chunks with {num_threads} threads): {numpy_time:.6f} seconds" + ) + + threads_numexpr = [] + for j in range(num_threads): + indices = chunk_indices[j::num_threads] # Distribute chunks across threads + thread = threading.Thread( + target=benchmark_numexpr_re_evaluate, + args=(expressions_numexpr[i], a, b, c, results_numexpr, indices), + ) + threads_numexpr.append(thread) + thread.start() + + for thread in threads_numexpr: + thread.join() + + numexpr_time = sum(results_numexpr) + print( + f"numexpr time (threaded with re_evaluate over {num_chunks} chunks with {num_threads} threads): {numexpr_time:.6f} seconds" + ) + print(f"numexpr speedup: {numpy_time / numexpr_time:.2f}x") + print("-" * 40) + + +if __name__ == "__main__": + run_benchmark_threaded() diff --git a/bench/issue-36.py b/bench/issue-36.py index 9c356cf..611bddb 100644 --- a/bench/issue-36.py +++ b/bench/issue-36.py @@ -2,10 +2,14 @@ # performs better than the serial code. See issue #36 for details. 
from __future__ import print_function + +from time import time + import numpy as np -import numexpr as ne from numpy.testing import assert_array_equal -from time import time + +import numexpr as ne + def bench(N): print("*** array length:", N) @@ -31,4 +35,3 @@ def bench(N): ne.set_num_threads(2) for N in range(10, 20): bench(2**N) - diff --git a/bench/issue-47.py b/bench/issue-47.py index 31c68a6..a48fbe2 100644 --- a/bench/issue-47.py +++ b/bench/issue-47.py @@ -1,4 +1,5 @@ import numpy + import numexpr numexpr.set_num_threads(8) diff --git a/bench/large_array_vs_numpy.py b/bench/large_array_vs_numpy.py index 72219a1..b480261 100644 --- a/bench/large_array_vs_numpy.py +++ b/bench/large_array_vs_numpy.py @@ -31,10 +31,12 @@ import os os.environ["NUMEXPR_NUM_THREADS"] = "16" +import threading +import timeit + import numpy as np + import numexpr as ne -import timeit -import threading array_size = 10**8 num_runs = 10 diff --git a/bench/multidim.py b/bench/multidim.py index 587f100..eeccd0b 100644 --- a/bench/multidim.py +++ b/bench/multidim.py @@ -12,9 +12,12 @@ # Based on a script provided by Andrew Collette. 
from __future__ import print_function + +import time + import numpy as np + import numexpr as nx -import time test_shapes = [ (100*100*100), @@ -90,5 +93,3 @@ def test_func(a, b, c): print("Simple: ", (stop1-start1)/nruns) print("Numexpr: ", (stop2-start2)/nruns) print("Chunked: ", (stop3-start3)/nruns) - - diff --git a/bench/poly.py b/bench/poly.py index 0f50290..3eb12b1 100644 --- a/bench/poly.py +++ b/bench/poly.py @@ -17,11 +17,13 @@ ####################################################################### from __future__ import print_function + import sys from time import time + import numpy as np -import numexpr as ne +import numexpr as ne #expr = ".25*x**3 + .75*x**2 - 1.5*x - 2" # the polynomial to compute expr = "((.25*x + .75)*x - 1.5)*x - 2" # a computer-friendly polynomial diff --git a/bench/timing.py b/bench/timing.py index c84a6f4..9c70610 100644 --- a/bench/timing.py +++ b/bench/timing.py @@ -9,7 +9,10 @@ #################################################################### from __future__ import print_function -import timeit, numpy + +import timeit + +import numpy array_size = 5e6 iterations = 2 diff --git a/bench/unaligned-simple.py b/bench/unaligned-simple.py index e168c78..b653c7a 100644 --- a/bench/unaligned-simple.py +++ b/bench/unaligned-simple.py @@ -13,8 +13,11 @@ """ from __future__ import print_function + from timeit import Timer + import numpy as np + import numexpr as ne niter = 10 diff --git a/bench/varying-expr.py b/bench/varying-expr.py index d04ab35..df7419c 100644 --- a/bench/varying-expr.py +++ b/bench/varying-expr.py @@ -13,9 +13,12 @@ # the latency of numexpr when working with small arrays. 
from __future__ import print_function + import sys from time import time + import numpy as np + import numexpr as ne N = 100 diff --git a/bench/vml_timing.py b/bench/vml_timing.py index 52f5003..57dd4d2 100644 --- a/bench/vml_timing.py +++ b/bench/vml_timing.py @@ -9,9 +9,12 @@ #################################################################### from __future__ import print_function + import sys import timeit + import numpy + import numexpr array_size = 5_000_000 diff --git a/bench/vml_timing2.py b/bench/vml_timing2.py index 32fdc62..4491162 100644 --- a/bench/vml_timing2.py +++ b/bench/vml_timing2.py @@ -4,13 +4,16 @@ # https://github.com/pydata/numexpr/wiki/NumexprMKL from __future__ import print_function + import datetime import sys +from time import time + import numpy as np + import numexpr as ne -from time import time -N = int(2**26) +N = int(2**28) x = np.linspace(0, 1, N) y = np.linspace(0, 1, N) @@ -28,17 +31,17 @@ print("Time for an algebraic expression: %.3f s / %.3f GB/s" % (t1-t0, gbs)) t0 = time() -z = np.sin(x)**2 + np.cos(y)**2 +z = np.sin(x)**3.2 + np.cos(y)**3.2 t1 = time() gbs = working_set_GB / (t1-t0) print("Time for a transcendental expression: %.3f s / %.3f GB/s" % (t1-t0, gbs)) if ne.use_vml: ne.set_vml_num_threads(1) - ne.set_num_threads(8) + ne.set_num_threads(16) print("NumExpr version: %s, Using MKL ver. 
%s, Num threads: %s" % (ne.__version__, ne.get_vml_version(), ne.nthreads)) else: - ne.set_num_threads(8) + ne.set_num_threads(16) print("NumExpr version: %s, Not Using MKL, Num threads: %s" % (ne.__version__, ne.nthreads)) t0 = time() @@ -48,7 +51,7 @@ print("Time for an algebraic expression: %.3f s / %.3f GB/s" % (t1-t0, gbs)) t0 = time() -ne.evaluate('sin(x)**2 + cos(y)**2', out = z) +ne.evaluate('sin(x)**3.2 + cos(y)**3.2', out = z) t1 = time() gbs = working_set_GB / (t1-t0) print("Time for a transcendental expression: %.3f s / %.3f GB/s" % (t1-t0, gbs)) diff --git a/bench/vml_timing3.py b/bench/vml_timing3.py index 04997ff..0086421 100644 --- a/bench/vml_timing3.py +++ b/bench/vml_timing3.py @@ -1,7 +1,9 @@ # -*- coding: utf-8 -*- +from timeit import default_timer as timer + import numpy as np + import numexpr as ne -from timeit import default_timer as timer x = np.ones(100000) scaler = -1J diff --git a/doc/api.rst b/doc/api.rst index 7d750e3..5d1bb0f 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -3,11 +3,11 @@ NumExpr API .. automodule:: numexpr :members: evaluate, re_evaluate, disassemble, NumExpr, get_vml_version, set_vml_accuracy_mode, set_vml_num_threads, set_num_threads, detect_number_of_cores, detect_number_of_threads - + .. py:attribute:: ncores The number of (virtual) cores detected. - + .. py:attribute:: nthreads The number of threads currently in-use. @@ -18,11 +18,11 @@ NumExpr API .. py:attribute:: version - The version of NumExpr. - - + The version of NumExpr. + + Tests submodule --------------- .. automodule:: numexpr.tests - :members: test, print_versions \ No newline at end of file + :members: test, print_versions diff --git a/doc/conf.py b/doc/conf.py index 6edbd64..60cf6c5 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -70,9 +70,9 @@ # built documents. # # The short X.Y version. -version = '2.8' +version = '2.13' # The full version, including alpha/beta/rc tags. 
-release = '2.8.5.dev1' +release = '2.13.dev1' # The language for content autogenerated by Sphinx. Refer to documentation diff --git a/doc/index.rst b/doc/index.rst index 02922c3..d517391 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -25,4 +25,3 @@ Indices and tables * :ref:`genindex` * :ref:`modindex` * :ref:`search` - diff --git a/doc/intro.rst b/doc/intro.rst index 11dbaaf..0d31925 100644 --- a/doc/intro.rst +++ b/doc/intro.rst @@ -1,25 +1,25 @@ How it works ============ -The string passed to :code:`evaluate` is compiled into an object representing the +The string passed to :code:`evaluate` is compiled into an object representing the expression and types of the arrays used by the function :code:`numexpr`. -The expression is first compiled using Python's :code:`compile` function (this means -that the expressions have to be valid Python expressions). From this, the -variable names can be taken. The expression is then evaluated using instances -of a special object that keep track of what is being done to them, and which +The expression is first compiled using Python's :code:`compile` function (this means +that the expressions have to be valid Python expressions). From this, the +variable names can be taken. The expression is then evaluated using instances +of a special object that keep track of what is being done to them, and which builds up the parse tree of the expression. -This parse tree is then compiled to a bytecode program, which describes how to -perform the operation element-wise. The virtual machine uses "vector registers": -each register is many elements wide (by default 4096 elements). The key to +This parse tree is then compiled to a bytecode program, which describes how to +perform the operation element-wise. The virtual machine uses "vector registers": +each register is many elements wide (by default 4096 elements). The key to NumExpr's speed is handling chunks of elements at a time. 
-There are two extremes to evaluating an expression elementwise. You can do each -operation as arrays, returning temporary arrays. This is what you do when you -use NumPy: :code:`2*a+3*b` uses three temporary arrays as large as :code:`a` or -:code:`b`. This strategy wastes memory (a problem if your arrays are large), -and also is not a good use of cache memory: for large arrays, the results of +There are two extremes to evaluating an expression elementwise. You can do each +operation as arrays, returning temporary arrays. This is what you do when you +use NumPy: :code:`2*a+3*b` uses three temporary arrays as large as :code:`a` or +:code:`b`. This strategy wastes memory (a problem if your arrays are large), +and also is not a good use of cache memory: for large arrays, the results of :code:`2*a` and :code:`3*b` won't be in cache when you do the add. The other extreme is to loop over each element, as in:: @@ -27,13 +27,13 @@ The other extreme is to loop over each element, as in:: for i in xrange(len(a)): c[i] = 2*a[i] + 3*b[i] -This doesn't consume extra memory, and is good for the cache, but, if the -expression is not compiled to machine code, you will have a big case statement -(or a bunch of if's) inside the loop, which adds a large overhead for each +This doesn't consume extra memory, and is good for the cache, but, if the +expression is not compiled to machine code, you will have a big case statement +(or a bunch of if's) inside the loop, which adds a large overhead for each element, and will hurt the branch-prediction used on the CPU. -:code:`numexpr` uses a in-between approach. Arrays are handled as chunks (of -4096 elements) at a time, using a register machine. As Python code, +:code:`numexpr` uses a in-between approach. Arrays are handled as chunks (of +4096 elements) at a time, using a register machine. 
As Python code, it looks something like this:: for i in xrange(0, len(a), 256): @@ -44,11 +44,11 @@ it looks something like this:: add(r2, r3, r2) c[i:i+128] = r2 -(remember that the 3-arg form stores the result in the third argument, -instead of allocating a new array). This achieves a good balance between -cache and branch-prediction. And the virtual machine is written entirely in -C, which makes it faster than the Python above. Furthermore the virtual machine -is also multi-threaded, which allows for efficient parallelization of NumPy +(remember that the 3-arg form stores the result in the third argument, +instead of allocating a new array). This achieves a good balance between +cache and branch-prediction. And the virtual machine is written entirely in +C, which makes it faster than the Python above. Furthermore the virtual machine +is also multi-threaded, which allows for efficient parallelization of NumPy operations. There is some more information and history at: @@ -58,12 +58,12 @@ http://www.bitsofbits.com/2014/09/21/numpy-micro-optimization-and-numexpr/ Expected performance ==================== -The range of speed-ups for NumExpr respect to NumPy can vary from 0.95x and 20x, -being 2x, 3x or 4x typical values, depending on the complexity of the -expression and the internal optimization of the operators used. The strided and -unaligned case has been optimized too, so if the expression contains such -arrays, the speed-up can increase significantly. Of course, you will need to -operate with large arrays (typically larger than the cache size of your CPU) +The range of speed-ups for NumExpr respect to NumPy can vary from 0.95x and 20x, +being 2x, 3x or 4x typical values, depending on the complexity of the +expression and the internal optimization of the operators used. The strided and +unaligned case has been optimized too, so if the expression contains such +arrays, the speed-up can increase significantly. 
Of course, you will need to +operate with large arrays (typically larger than the cache size of your CPU) to see these improvements in performance. Here there are some real timings. For the contiguous case:: diff --git a/doc/mkl.rst b/doc/mkl.rst index 6951655..0c706bb 100644 --- a/doc/mkl.rst +++ b/doc/mkl.rst @@ -1,19 +1,19 @@ NumExpr with Intel MKL ====================== -Numexpr has support for Intel's VML (included in Intel's MKL) in order to -accelerate the evaluation of transcendental functions on Intel CPUs. Here it +Numexpr has support for Intel's VML (included in Intel's MKL) in order to +accelerate the evaluation of transcendental functions on Intel CPUs. Here it is a small example on the kind of improvement you may get by using it. A first benchmark ----------------- -Firstly, we are going to exercise how MKL performs when computing a couple of -simple expressions. One is a pure algebraic one: :code:`2*y + 4*x` and the other +Firstly, we are going to exercise how MKL performs when computing a couple of +simple expressions. One is a pure algebraic one: :code:`2*y + 4*x` and the other contains transcendental functions: :code:`sin(x)**2 + cos(y)**2`. -For this, we are going to use this worksheet_. I (Francesc Alted) ran this -benchmark on a Intel Xeon E3-1245 v5 @ 3.50GHz. Here are the results when +For this, we are going to use this worksheet_. I (Francesc Alted) ran this +benchmark on a Intel Xeon E3-1245 v5 @ 3.50GHz. Here are the results when not using MKL:: NumPy version: 1.11.1 @@ -22,7 +22,7 @@ not using MKL:: Numexpr version: 2.6.1. Using MKL: False Time for an algebraic expression: 0.058 s / 19.116 GB/s Time for a transcendental expression: 0.283 s / 3.950 GB/s - + And now, using MKL:: @@ -34,14 +34,14 @@ And now, using MKL:: Time for a transcendental expression: 0.075 s / 14.975 GB/s -As you can see, numexpr using MKL can be up to 3.8x faster for the case of the -transcendental expression. 
Also, you can notice that the pure algebraic -expression is not accelerated at all. This is completely expected, as the -MKL is offering accelerations for CPU bounded functions (sin, cos, tan, exp, +As you can see, numexpr using MKL can be up to 3.8x faster for the case of the +transcendental expression. Also, you can notice that the pure algebraic +expression is not accelerated at all. This is completely expected, as the +MKL is offering accelerations for CPU bounded functions (sin, cos, tan, exp, log, sinh...) and not pure multiplications or adds. -Finally, note how numexpr+MKL can be up to 26x faster than using a pure NumPy -solution. And this was using a processor with just four physical cores; you +Finally, note how numexpr+MKL can be up to 26x faster than using a pure NumPy +solution. And this was using a processor with just four physical cores; you should expect more speedup as you throw more cores at that. .. _worksheet: https://github.com/pydata/numexpr/blob/master/bench/vml_timing2.py @@ -49,28 +49,28 @@ should expect more speedup as you throw more cores at that. More benchmarks (older) ----------------------- -Numexpr & VML can both use several threads for doing computations. Let's see -how performance improves by using 1 or 2 threads on a 2-core Intel CPU (Core2 +Numexpr & VML can both use several threads for doing computations. Let's see +how performance improves by using 1 or 2 threads on a 2-core Intel CPU (Core2 E8400 @ 3.00GHz). Using 1 thread ^^^^^^^^^^^^^^ -Here we have some benchmarks on the improvement of speed that Intel's VML can -achieve. First, look at times by some easy expression containing sine and +Here we have some benchmarks on the improvement of speed that Intel's VML can +achieve. 
First, look at times by some easy expression containing sine and cosine operations *without* using VML:: In [17]: ne.use_vml Out[17]: False - + In [18]: x = np.linspace(-1, 1, 1e6) - + In [19]: timeit np.sin(x)**2+np.cos(x)**2 10 loops, best of 3: 43.1 ms per loop - + In [20]: ne.set_num_threads(1) Out[20]: 2 - + In [21]: timeit ne.evaluate('sin(x)**2+cos(x)**2') 10 loops, best of 3: 29.5 ms per loop @@ -79,15 +79,15 @@ and now using VML:: In [37]: ne.use_vml Out[37]: True - + In [38]: x = np.linspace(-1, 1, 1e6) - + In [39]: timeit np.sin(x)**2+np.cos(x)**2 10 loops, best of 3: 42.8 ms per loop - + In [40]: ne.set_num_threads(1) Out[40]: 2 - + In [41]: timeit ne.evaluate('sin(x)**2+cos(x)**2') 100 loops, best of 3: 19.8 ms per loop @@ -96,37 +96,37 @@ Hey, VML can accelerate computations by a 50% using a single CPU. That's great! Using 2 threads ^^^^^^^^^^^^^^^ -First, look at the time of the non-VML numexpr when using 2 threads:: +First, look at the time of the non-VML numexpr when using 2 threads:: In [22]: ne.set_num_threads(2) Out[22]: 1 - + In [23]: timeit ne.evaluate('sin(x)**2+cos(x)**2') 100 loops, best of 3: 15.3 ms per loop -OK. We've got an almost perfect 2x improvement in speed with regard to the 1 +OK. We've got an almost perfect 2x improvement in speed with regard to the 1 thread case. Let's see about the VML-powered numexpr version:: In [43]: ne.set_num_threads(2) Out[43]: 1 - + In [44]: timeit ne.evaluate('sin(x)**2+cos(x)**2') 100 loops, best of 3: 12.2 ms per loop -Ok, that's about 1.6x improvement over the 1 thread VML computation, and -still a 25% of improvement over the non-VML version. Good, native numexpr +Ok, that's about 1.6x improvement over the 1 thread VML computation, and +still a 25% of improvement over the non-VML version. Good, native numexpr multithreading code really looks very efficient! 
Numexpr native threading code vs VML's one ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -You may already know that both numexpr and Intel's VML do have support for -multithreaded computations, but you might be curious about which one is more -efficient, so here it goes a hint. First, using the VML multithreaded +You may already know that both numexpr and Intel's VML do have support for +multithreaded computations, but you might be curious about which one is more +efficient, so here it goes a hint. First, using the VML multithreaded implementation:: In [49]: ne.set_vml_num_threads(2) - + In [50]: ne.set_num_threads(1) Out[50]: 1 @@ -146,14 +146,14 @@ and now, using the native numexpr threading code:: 100 loops, best of 3: 12 ms per loop -This means that numexpr's native multithreaded code is about 40% faster than -VML's for this case. So, in general, you should use the former with numexpr +This means that numexpr's native multithreaded code is about 40% faster than +VML's for this case. So, in general, you should use the former with numexpr (and this is the default actually). Mixing numexpr's and VML multithreading capabilities ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Finally, you might be tempted to use both multithreading codes at the same +Finally, you might be tempted to use both multithreading codes at the same time, but you will be deceived about the improvement in performance:: In [57]: ne.set_vml_num_threads(2) @@ -161,7 +161,7 @@ time, but you will be deceived about the improvement in performance:: In [58]: timeit ne.evaluate('sin(x)**2+cos(x)**2') 100 loops, best of 3: 17.7 ms per loop -Your code actually performs much worse. That's normal too because you are -trying to run 4 threads on a 2-core CPU. For CPUs with many cores, you may -want to try with different threading configurations, but as a rule of thumb, -numexpr's one will generally win. \ No newline at end of file +Your code actually performs much worse. 
That's normal too because you are +trying to run 4 threads on a 2-core CPU. For CPUs with many cores, you may +want to try with different threading configurations, but as a rule of thumb, +numexpr's one will generally win. diff --git a/doc/release_notes.rst b/doc/release_notes.rst index 081e7f4..51d3212 100644 --- a/doc/release_notes.rst +++ b/doc/release_notes.rst @@ -1,4 +1,4 @@ Release Notes ============= -.. include:: ../RELEASE_NOTES.rst \ No newline at end of file +.. include:: ../RELEASE_NOTES.rst diff --git a/doc/user_guide.rst b/doc/user_guide.rst index 3a3cf63..84f085d 100644 --- a/doc/user_guide.rst +++ b/doc/user_guide.rst @@ -1,5 +1,5 @@ -NumExpr 2.8 User Guide -====================== +NumExpr User Guide +================== The NumExpr package supplies routines for the fast evaluation of array expressions elementwise by using a vector-based virtual @@ -30,7 +30,7 @@ and it can also re_evaluate an expression:: Building -------- -*NumExpr* requires Python_ 3.7 or greater, and NumPy_ 1.13 or greater. It is +*NumExpr* requires Python_ 3.7 or greater, and NumPy_ 1.13 or greater. It is built in the standard Python way: .. code-block:: bash @@ -39,7 +39,7 @@ built in the standard Python way: You must have a C-compiler (i.e. MSVC Build tools on Windows and GCC on Linux) installed. -Then change to a directory that is not the repository directory (e.g. `/tmp`) and +Then change to a directory that is not the repository directory (e.g. `/tmp`) and test :code:`numexpr` with: .. code-block:: bash @@ -73,23 +73,23 @@ affect performance). Threadpool Configuration ------------------------ -Threads are spawned at import-time, with the number being set by the environment -variable ``NUMEXPR_MAX_THREADS``. The default maximum thread count is **64**. +Threads are spawned at import-time, with the number being set by the environment +variable ``NUMEXPR_MAX_THREADS``. The default maximum thread count is **64**. 
There is no advantage to spawning more threads than the number of virtual cores -available on the computing node. Practically NumExpr scales at large thread -count (`> 8`) only on very large matrices (`> 2**22`). Spawning large numbers -of threads is not free, and can increase import times for NumExpr or packages +available on the computing node. Practically NumExpr scales at large thread +count (`> 8`) only on very large matrices (`> 2**22`). Spawning large numbers +of threads is not free, and can increase import times for NumExpr or packages that import it such as Pandas or PyTables. -If desired, the number of threads in the pool used can be adjusted via an -environment variable, ``NUMEXPR_NUM_THREADS`` (preferred) or ``OMP_NUM_THREADS``. -Typically only setting ``NUMEXPR_MAX_THREADS`` is sufficient; the number of -threads used can be adjusted dynamically via ``numexpr.set_num_threads(int)``. +If desired, the number of threads in the pool used can be adjusted via an +environment variable, ``NUMEXPR_NUM_THREADS`` (preferred) or ``OMP_NUM_THREADS``. +Typically only setting ``NUMEXPR_MAX_THREADS`` is sufficient; the number of +threads used can be adjusted dynamically via ``numexpr.set_num_threads(int)``. The number of threads can never exceed that set by ``NUMEXPR_MAX_THREADS``. -If the user has not configured the environment prior to importing NumExpr, info -logs will be generated, and the initial number of threads *that are used*_ will -be set to the number of cores detected in the system or 8, whichever is *less*. +If the user has not configured the environment prior to importing NumExpr, info +logs will be generated, and the initial number of threads *that are used*_ will +be set to the number of cores detected in the system or 8, whichever is *less*. Usage:: @@ -111,16 +111,16 @@ function's frame (through the use of :code:`sys._getframe()`). 
Alternatively, they can be specified using the :code:`local_dict` or :code:`global_dict` arguments, or passed as keyword arguments. -The :code:`optimization` parameter can take the values :code:`'moderate'` -or :code:`'aggressive'`. :code:`'moderate'` means that no optimization is made -that can affect precision at all. :code:`'aggressive'` (the default) means that -the expression can be rewritten in a way that precision *could* be affected, but -normally very little. For example, in :code:`'aggressive'` mode, the -transformation :code:`x~**3` -> :code:`x*x*x` is made, but not in +The :code:`optimization` parameter can take the values :code:`'moderate'` +or :code:`'aggressive'`. :code:`'moderate'` means that no optimization is made +that can affect precision at all. :code:`'aggressive'` (the default) means that +the expression can be rewritten in a way that precision *could* be affected, but +normally very little. For example, in :code:`'aggressive'` mode, the +transformation :code:`x~**3` -> :code:`x*x*x` is made, but not in :code:`'moderate'` mode. -The `truediv` parameter specifies whether the division is a 'floor division' -(False) or a 'true division' (True). The default is the value of +The `truediv` parameter specifies whether the division is a 'floor division' +(False) or a 'true division' (True). The default is the value of `__future__.division` in the interpreter. See PEP 238 for details. Expressions are cached, so reuse is fast. Arrays or scalars are @@ -164,22 +164,22 @@ Casting rules in NumExpr follow closely those of *NumPy*. However, for implementation reasons, there are some known exceptions to this rule, namely: - * When an array with type :code:`int8`, :code:`uint8`, :code:`int16` or - :code:`uint16` is used inside NumExpr, it is internally upcasted to an - :code:`int` (or :code:`int32` in NumPy notation). 
- * When an array with type :code:`uint32` is used inside NumExpr, it is - internally upcasted to a :code:`long` (or :code:`int64` in NumPy notation). - * A floating point function (e.g. :code:`sin`) acting on :code:`int8` or - :code:`int16` types returns a :code:`float64` type, instead of the - :code:`float32` that is returned by NumPy functions. This is mainly due + * When an array with type :code:`int8`, :code:`uint8`, :code:`int16` or + :code:`uint16` is used inside NumExpr, it is internally upcasted to an + :code:`int` (or :code:`int32` in NumPy notation). + * When an array with type :code:`uint32` is used inside NumExpr, it is + internally upcasted to a :code:`long` (or :code:`int64` in NumPy notation). + * A floating point function (e.g. :code:`sin`) acting on :code:`int8` or + :code:`int16` types returns a :code:`float64` type, instead of the + :code:`float32` that is returned by NumPy functions. This is mainly due to the absence of native :code:`int8` or :code:`int16` types in NumExpr. - * In operations implying a scalar and an array, the normal rules of casting - are used in NumExpr, in contrast with NumPy, where array types takes - priority. For example, if :code:`a` is an array of type :code:`float32` - and :code:`b` is an scalar of type :code:`float64` (or Python :code:`float` - type, which is equivalent), then :code:`a*b` returns a :code:`float64` in - NumExpr, but a :code:`float32` in NumPy (i.e. array operands take priority - in determining the result type). If you need to keep the result a + * In operations implying a scalar and an array, the normal rules of casting + are used in NumExpr, in contrast with NumPy, where array types takes + priority. For example, if :code:`a` is an array of type :code:`float32` + and :code:`b` is an scalar of type :code:`float64` (or Python :code:`float` + type, which is equivalent), then :code:`a*b` returns a :code:`float64` in + NumExpr, but a :code:`float32` in NumPy (i.e. 
array operands take priority + in determining the result type). If you need to keep the result a :code:`float32`, be sure you use a :code:`float32` scalar too. @@ -188,10 +188,10 @@ Supported operators *NumExpr* supports the set of operators listed below: - * Bitwise operators (and, or, not, xor): :code:`&, |, ~, ^` + * Bitwise and logical operators (and, or, not, xor): :code:`&, |, ~, ^` * Comparison operators: :code:`<, <=, ==, !=, >=, >` * Unary arithmetic operators: :code:`-` - * Binary arithmetic operators: :code:`+, -, *, /, **, %, <<, >>` + * Binary arithmetic operators: :code:`+, -, *, /, //, **, %, <<, >>` Supported functions @@ -199,42 +199,55 @@ Supported functions The next are the current supported set: - * :code:`where(bool, number1, number2): number` -- number1 if the bool condition + * :code:`where(bool, number1, number2): number` -- number1 if the bool condition is true, number2 otherwise. - * :code:`{sin,cos,tan}(float|complex): float|complex` -- trigonometric sine, + * :code:`{isinf, isnan, isfinite}(float|complex): bool` -- returns element-wise True + for ``inf`` or ``NaN``, ``NaN``, not ``inf`` respectively. + * :code:`signbit(float|complex): bool` -- returns element-wise True if signbit is set + False otherwise. + * :code:`{sin,cos,tan}(float|complex): float|complex` -- trigonometric sine, cosine or tangent. - * :code:`{arcsin,arccos,arctan}(float|complex): float|complex` -- trigonometric + * :code:`{arcsin,arccos,arctan}(float|complex): float|complex` -- trigonometric inverse sine, cosine or tangent. - * :code:`arctan2(float1, float2): float` -- trigonometric inverse tangent of + * :code:`arctan2(float1, float2): float` -- trigonometric inverse tangent of float1/float2. 
- * :code:`{sinh,cosh,tanh}(float|complex): float|complex` -- hyperbolic sine, + * :code:`hypot(float1, float2): float` -- Euclidean distance between float1, float2 + * :code:`nextafter(float1, float2): float` -- next representable floating-point value after + float1 in direction of float2 + * :code:`copysign(float1, float2): float` -- return number with magnitude of float1 and + sign of float2 + * :code:`{maximum,minimum}(float1, float2): float` -- return max/min of float1, float2 + * :code:`{sinh,cosh,tanh}(float|complex): float|complex` -- hyperbolic sine, cosine or tangent. - * :code:`{arcsinh,arccosh,arctanh}(float|complex): float|complex` -- hyperbolic + * :code:`{arcsinh,arccosh,arctanh}(float|complex): float|complex` -- hyperbolic inverse sine, cosine or tangent. - * :code:`{log,log10,log1p}(float|complex): float|complex` -- natural, base-10 and + * :code:`{log,log10,log1p,log2}(float|complex): float|complex` -- natural, base-10 and log(1+x) logarithms. - * :code:`{exp,expm1}(float|complex): float|complex` -- exponential and exponential + * :code:`{exp,expm1}(float|complex): float|complex` -- exponential and exponential minus one. * :code:`sqrt(float|complex): float|complex` -- square root. - * :code:`abs(float|complex): float|complex` -- absolute value. + * :code:`trunc(float): float` -- round towards zero + * :code:`round(float|complex|int): float|complex|int` -- round to nearest integer (`rint`) + * :code:`sign(float|complex|int): float|complex|int` -- return -1, 0, +1 depending on sign + * :code:`abs(float|complex|int): float|complex|int` -- absolute value. * :code:`conj(complex): complex` -- conjugate value. * :code:`{real,imag}(complex): float` -- real or imaginary part of complex. - * :code:`complex(float, float): complex` -- complex from real and imaginary + * :code:`complex(float, float): complex` -- complex from real and imaginary parts. 
- * :code:`contains(np.str, np.str): bool` -- returns True for every string in :code:`op1` that + * :code:`contains(np.str, np.str): bool` -- returns True for every string in :code:`op1` that contains :code:`op2`. Notes ----- * :code:`abs()` for complex inputs returns a :code:`complex` output too. This is a - departure from NumPy where a :code:`float` is returned instead. However, - NumExpr is not flexible enough yet so as to allow this to happen. - Meanwhile, if you want to mimic NumPy behaviour, you may want to select the - real part via the :code:`real` function (e.g. :code:`real(abs(cplx))`) or via the + departure from NumPy where a :code:`float` is returned instead. However, + NumExpr is not flexible enough yet so as to allow this to happen. + Meanwhile, if you want to mimic NumPy behaviour, you may want to select the + real part via the :code:`real` function (e.g. :code:`real(abs(cplx))`) or via the :code:`real` selector (e.g. :code:`abs(cplx).real`). -More functions can be added if you need them. Note however that NumExpr 2.6 is +More functions can be added if you need them. Note however that NumExpr 2.6 is in maintenance mode and a new major revision is under development. Supported reduction operations @@ -242,12 +255,12 @@ Supported reduction operations The next are the current supported set: - * :code:`sum(number, axis=None)`: Sum of array elements over a given axis. + * :code:`sum(number, axis=None)`: Sum of array elements over a given axis. Negative axis are not supported. - * :code:`prod(number, axis=None)`: Product of array elements over a given axis. + * :code:`prod(number, axis=None)`: Product of array elements over a given axis. Negative axis are not supported. -*Note:* because of internal limitations, reduction operations must appear the +*Note:* because of internal limitations, reduction operations must appear the last in the stack. If not, it will be issued an error like:: >>> ne.evaluate('sum(1)*(-1)') @@ -256,23 +269,23 @@ last in the stack. 
If not, it will be issued an error like:: General routines ---------------- - * :code:`evaluate(expression, local_dict=None, global_dict=None, - optimization='aggressive', truediv='auto')`: Evaluate a simple array + * :code:`evaluate(expression, local_dict=None, global_dict=None, + optimization='aggressive', truediv='auto')`: Evaluate a simple array expression element-wise. See examples above. - * :code:`re_evaluate(local_dict=None)`: Re-evaluate the last array expression - without any check. This is meant for accelerating loops that are re-evaluating - the same expression repeatedly without changing anything else than the operands. + * :code:`re_evaluate(local_dict=None)`: Re-evaluate the last array expression + without any check. This is meant for accelerating loops that are re-evaluating + the same expression repeatedly without changing anything else than the operands. If unsure, use evaluate() which is safer. * :code:`test()`: Run all the tests in the test suite. * :code:`print_versions()`: Print the versions of software that numexpr relies on. - * :code:`set_num_threads(nthreads)`: Sets a number of threads to be used in operations. - Returns the previous setting for the number of threads. See note below to see + * :code:`set_num_threads(nthreads)`: Sets a number of threads to be used in operations. + Returns the previous setting for the number of threads. See note below to see how the number of threads is set via environment variables. - If you are using VML, you may want to use *set_vml_num_threads(nthreads)* to - perform the parallel job with VML instead. However, you should get very - similar performance with VML-optimized functions, and VML's parallelizer - cannot deal with common expressions like `(x+1)*(x-2)`, while NumExpr's + If you are using VML, you may want to use *set_vml_num_threads(nthreads)* to + perform the parallel job with VML instead. 
However, you should get very + similar performance with VML-optimized functions, and VML's parallelizer + cannot deal with common expressions like `(x+1)*(x-2)`, while NumExpr's one can. * :code:`detect_number_of_cores()`: Detects the number of cores on a system. @@ -324,4 +337,4 @@ License NumExpr is distributed under the MIT_ license. -.. _MIT: http://www.opensource.org/licenses/mit-license.php \ No newline at end of file +.. _MIT: http://www.opensource.org/licenses/mit-license.php diff --git a/doc/vm2.rst b/doc/vm2.rst index 45e9fc9..01c9826 100644 --- a/doc/vm2.rst +++ b/doc/vm2.rst @@ -1,32 +1,32 @@ Performance of the Virtual Machine in NumExpr2.0 ================================================ -Numexpr 2.0 leverages a new virtual machine completely based on the new ndarray -iterator introduced in NumPy 1.6. This represents a nice combination of the -advantages of using the new iterator, while retaining the ability to avoid -copies in memory as well as the multi-threading capabilities of the previous +Numexpr 2.0 leverages a new virtual machine completely based on the new ndarray +iterator introduced in NumPy 1.6. This represents a nice combination of the +advantages of using the new iterator, while retaining the ability to avoid +copies in memory as well as the multi-threading capabilities of the previous virtual machine (1.x series). -The increased performance of the new virtual machine can be seen in several +The increased performance of the new virtual machine can be seen in several scenarios, like: - * *Broadcasting*. Expressions containing arrays that needs to be broadcasted, + * *Broadcasting*. Expressions containing arrays that needs to be broadcasted, will not need additional memory (i.e. they will be broadcasted on-the-fly). - * *Non-native dtypes*. These will be translated to native dtypes on-the-fly, + * *Non-native dtypes*. These will be translated to native dtypes on-the-fly, so there is not need to convert the whole arrays first. 
- * *Fortran-ordered arrays*. The new iterator will find the best path to + * *Fortran-ordered arrays*. The new iterator will find the best path to optimize operations on such arrays, without the need to transpose them first. -There is a drawback though: performance with small arrays suffers a bit because -of higher set-up times for the new virtual machine. See below for detailed +There is a drawback though: performance with small arrays suffers a bit because +of higher set-up times for the new virtual machine. See below for detailed benchmarks. Some benchmarks for best-case scenarios --------------------------------------- -Here you have some benchmarks of some scenarios where the new virtual machine -actually represents an advantage in terms of speed (also memory, but this is -not shown here). As you will see, the improvement is notable in many areas, +Here you have some benchmarks of some scenarios where the new virtual machine +actually represents an advantage in terms of speed (also memory, but this is +not shown here). As you will see, the improvement is notable in many areas, ranging from 3x to 6x faster operations. Broadcasting @@ -85,7 +85,7 @@ Mix of 'non-native' arrays, Fortran-ordered, and using broadcasting Longer setup-time ^^^^^^^^^^^^^^^^^ -The only drawback of the new virtual machine is during the computation of +The only drawback of the new virtual machine is during the computation of small arrays:: >>> a = np.arange(10) @@ -98,8 +98,8 @@ small arrays:: 10000 loops, best of 3: 30.6 µs per loop -i.e. the new virtual machine takes a bit more time to set-up (around 8 µs in -this machine). However, this should be not too important because for such a +i.e. the new virtual machine takes a bit more time to set-up (around 8 µs in +this machine). 
However, this should be not too important because for such a small arrays NumPy is always a better option:: >>> timeit c = a*(b+1) @@ -121,8 +121,8 @@ And for arrays large enough the difference is negligible:: Conclusion ---------- -The new virtual machine introduced in numexpr 2.0 brings more performance in -many different scenarios (broadcast, non-native dtypes, Fortran-orderd arrays), -while it shows slightly worse performance for small arrays. However, as -numexpr is more geared to compute large arrays, the new virtual machine should -be good news for numexpr users in general. \ No newline at end of file +The new virtual machine introduced in numexpr 2.0 brings more performance in +many different scenarios (broadcast, non-native dtypes, Fortran-orderd arrays), +while it shows slightly worse performance for small arrays. However, as +numexpr is more geared to compute large arrays, the new virtual machine should +be good news for numexpr users in general. diff --git a/issues/issue418.py b/issues/issue418.py index b871c65..31ca2fc 100644 --- a/issues/issue418.py +++ b/issues/issue418.py @@ -1,7 +1,9 @@ +from time import perf_counter as pc + +import matplotlib.pyplot as plt import numpy as np + import numexpr as ne -import matplotlib.pyplot as plt -from time import perf_counter as pc # geomspace seems to be very slow, just a warning about setting `n` too high. # n = 2**24 diff --git a/numexpr/__init__.py b/numexpr/__init__.py index 648b869..63bb9e9 100644 --- a/numexpr/__init__.py +++ b/numexpr/__init__.py @@ -21,21 +21,20 @@ """ -from numexpr.interpreter import MAX_THREADS, use_vml, __BLOCK_SIZE1__ +from numexpr.interpreter import __BLOCK_SIZE1__, MAX_THREADS, use_vml is_cpu_amd_intel = False # DEPRECATION WARNING: WILL BE REMOVED IN FUTURE RELEASE -# cpuinfo imports were moved into the test submodule function that calls them +# cpuinfo imports were moved into the test submodule function that calls them # to improve import times. 
from numexpr.expressions import E -from numexpr.necompiler import (NumExpr, disassemble, evaluate, re_evaluate, - validate) - -from numexpr.utils import (_init_num_threads, - get_vml_version, set_vml_accuracy_mode, set_vml_num_threads, - set_num_threads, get_num_threads, - detect_number_of_cores, detect_number_of_threads) +from numexpr.necompiler import (NumExpr, disassemble, evaluate, re_evaluate, + validate) +from numexpr.utils import (_init_num_threads, detect_number_of_cores, + detect_number_of_threads, get_num_threads, + get_vml_version, set_num_threads, + set_vml_accuracy_mode, set_vml_num_threads) # Detect the number of cores ncores = detect_number_of_cores() @@ -45,6 +44,7 @@ # set_vml_num_threads(1) from . import version + __version__ = version.version def print_versions(): @@ -63,4 +63,4 @@ def test(verbosity=1): return numexpr.tests.test(verbosity=verbosity) except ImportError: # To maintain Python 2.6 compatibility we have simple error handling - raise ImportError('`numexpr.tests` could not be imported, likely it was excluded from the distribution.') \ No newline at end of file + raise ImportError('`numexpr.tests` could not be imported, likely it was excluded from the distribution.') diff --git a/numexpr/bespoke_functions.hpp b/numexpr/bespoke_functions.hpp new file mode 100644 index 0000000..26f784e --- /dev/null +++ b/numexpr/bespoke_functions.hpp @@ -0,0 +1,339 @@ +#include +#include +#include +#include +#include +#include "numexpr_config.hpp" // isnan definitions + +// Generic sign function +inline int signi(int x) {return (0 < x) - (x < 0);} +inline long signl(long x) {return (0 < x) - (x < 0);} +inline double sign(double x){ + // Floats: -1.0, 0.0, +1.0, NaN stays NaN + if (isnand(x)) {return NAN;} + if (x > 0) {return 1;} + if (x < 0) {return -1;} + return 0; // handles +0.0 and -0.0 + } +inline float signf(float x){ + // Floats: -1.0, 0.0, +1.0, NaN stays NaN + if (isnanf_(x)) {return NAN;} + if (x > 0) {return 1;} + if (x < 0) {return -1;} + 
return 0; // handles +0.0 and -0.0 + } + +// round function for ints +inline int rinti(int x) {return x;} +inline long rintl(long x) {return x;} +// abs function for ints +inline int fabsi(int x) {return x<0 ? -x: x;} +inline long fabsl(long x) {return x<0 ? -x: x;} +// fmod function for ints +// TODO: Have to add FUNC_III, FUNC_LLL signatures to functions.hpp to enable these +// inline int fmodi(int x, int y) {return (int)fmodf((float)x, (float)y);} +// inline long fmodl(long x, long y) {return (long)fmodf((long)x, (long)y);} + +#ifdef USE_VML +// To match Numpy behaviour for NaNs +static void vsFmax_(MKL_INT n, const float* x1, const float* x2, float* dest) +{ + vsFmax(n, x1, x2, dest); + MKL_INT j; + for (j=0; j +#include // NAN #include /* constants */ @@ -347,6 +348,8 @@ nc_cosh(std::complex *x, std::complex *r) #define M_LOG10_E 0.434294481903251827651128918916605082294397 +#define M_LOG2_E 1.44269504088896340735992468100189213742664 + static void nc_log10(std::complex *x, std::complex *r) @@ -357,6 +360,15 @@ nc_log10(std::complex *x, std::complex *r) return; } +static void +nc_log2(std::complex *x, std::complex *r) +{ + nc_log(x, r); + r->real(r->real() * M_LOG2_E); + r->imag(r->imag() * M_LOG2_E); + return; +} + static void nc_sin(std::complex *x, std::complex *r) { @@ -378,42 +390,45 @@ nc_sinh(std::complex *x, std::complex *r) static void nc_tan(std::complex *x, std::complex *r) { - double sr,cr,shi,chi; - double rs,is,rc,ic; - double d; - double xr=x->real(), xi=x->imag(); - sr = sin(xr); - cr = cos(xr); - shi = sinh(xi); - chi = cosh(xi); - rs = sr*chi; - is = cr*shi; - rc = cr*chi; - ic = -sr*shi; - d = rc*rc + ic*ic; - r->real((rs*rc+is*ic)/d); - r->imag((is*rc-rs*ic)/d); + double xr = x->real(); + double xi = x->imag(); + double imag_part; + + double denom = cos(2*xr) + cosh(2*xi); + // handle overflows + if (xi > 20) { + imag_part = 1.0 / (1.0 + exp(-4*xi)); + } else if (xi < -20) { + imag_part = -1.0 / (1.0 + exp(4*xi)); + } else { + imag_part = 
sinh(2*xi) / denom; + } + double real_part = sin(2*xr) / denom; + + r->real(real_part); + r->imag(imag_part); return; } static void nc_tanh(std::complex *x, std::complex *r) { - double si,ci,shr,chr; - double rs,is,rc,ic; - double d; - double xr=x->real(), xi=x->imag(); - si = sin(xi); - ci = cos(xi); - shr = sinh(xr); - chr = cosh(xr); - rs = ci*shr; - is = si*chr; - rc = ci*chr; - ic = si*shr; - d = rc*rc + ic*ic; - r->real((rs*rc+is*ic)/d); - r->imag((is*rc-rs*ic)/d); + double xr = x->real(); + double xi = x->imag(); + double real_part; + double denom = cosh(2*xr) + cos(2*xi); + // handle overflows + if (xr > 20) { + real_part = 1.0 / (1.0 + exp(-4*xr)); + } else if (xr < -20) { + real_part = -1.0 / (1.0 + exp(4*xr)); + } else { + real_part = sinh(2*xr) / denom; + } + double imag_part = sin(2*xi) / denom; + + r->real(real_part); + r->imag(imag_part); return; } @@ -424,4 +439,60 @@ nc_abs(std::complex *x, std::complex *r) r->imag(0); } +static void +nc_rint(std::complex *x, std::complex *r) +{ + r->real(rint(x->real())); + r->imag(rint(x->imag())); +} + +static bool +nc_isinf(std::complex *x) +{ + double xr=x->real(), xi=x->imag(); + bool bi,br; + bi = isinfd(xi); + br = isinfd(xr); + return bi || br; +} + +static bool +nc_isnan(std::complex *x) +{ + double xr=x->real(), xi=x->imag(); + bool bi,br; + bi = isnand(xi); + br = isnand(xr); + return bi || br; +} + +static bool +nc_isfinite(std::complex *x) +{ + double xr=x->real(), xi=x->imag(); + bool bi,br; + bi = isfinited(xi); + br = isfinited(xr); + return bi && br; +} + +static void +nc_sign(std::complex *x, std::complex *r) +{ + if (nc_isnan(x)){ + r->real(NAN); + r->imag(NAN); + } + std::complex mag; + nc_abs(x, &mag); + if (mag.real() == 0){ + r->real(0); + r->imag(0); + } + else{ + r->real(x->real()/mag.real()); + r->imag(x->imag()/mag.real()); + } +} + #endif // NUMEXPR_COMPLEX_FUNCTIONS_HPP diff --git a/numexpr/cpuinfo.py b/numexpr/cpuinfo.py index 4a57d3c..897a4ca 100755 --- a/numexpr/cpuinfo.py +++ 
b/numexpr/cpuinfo.py @@ -23,12 +23,14 @@ __all__ = ['cpu'] -import sys, re, types +import inspect import os +import platform +import re import subprocess +import sys +import types import warnings -import platform -import inspect is_cpu_amd_intel = False # DEPRECATION WARNING: WILL BE REMOVED IN FUTURE RELEASE diff --git a/numexpr/expressions.py b/numexpr/expressions.py index 419d7dc..cab0247 100644 --- a/numexpr/expressions.py +++ b/numexpr/expressions.py @@ -35,6 +35,7 @@ from numexpr import interpreter + class Expression(): def __getattr__(self, name): @@ -185,11 +186,12 @@ def function(*args): return ConstantNode(func(*[x.value for x in args])) kind = commonKind(args) if kind in ('int', 'long'): - # Exception for following NumPy casting rules - #FIXME: this is not always desirable. The following - # functions which return ints (for int inputs) on numpy - # but not on numexpr: copy, abs, fmod, ones_like - kind = 'double' + if func.__name__ not in ('copy', 'abs', 'ones_like', 'round', 'sign'): + # except for these special functions (which return ints for int inputs in NumPy) + # just do a cast to double + # FIXME: 'fmod' outputs ints for NumPy when inputs are ints, but need to + # add new function signatures FUNC_LLL FUNC_III to support this + kind = 'double' else: # Apply regular casting rules if minkind and kind_rank.index(minkind) > kind_rank.index(kind): @@ -269,10 +271,10 @@ def rtruediv_op(a, b): @ophelper def pow_op(a, b): - + if isinstance(b, ConstantNode): x = b.value - if ( a.astKind in ('int', 'long') and + if ( a.astKind in ('int', 'long') and b.astKind in ('int', 'long') and x < 0) : raise ValueError( 'Integers to negative integer powers are not allowed.') @@ -347,16 +349,26 @@ def multiply(x, y): 'fmod': func(numpy.fmod, 'float'), 'arctan2': func(numpy.arctan2, 'float'), + 'hypot': func(numpy.hypot, 'double'), + 'nextafter': func(numpy.nextafter, 'double'), + 'copysign': func(numpy.copysign, 'double'), + 'maximum': func(numpy.maximum, 'double'), + 
'minimum': func(numpy.minimum, 'double'), + 'log': func(numpy.log, 'float'), 'log1p': func(numpy.log1p, 'float'), 'log10': func(numpy.log10, 'float'), + 'log2': func(numpy.log2, 'float'), 'exp': func(numpy.exp, 'float'), 'expm1': func(numpy.expm1, 'float'), 'abs': func(numpy.absolute, 'float'), 'ceil': func(numpy.ceil, 'float', 'double'), 'floor': func(numpy.floor, 'float', 'double'), + 'round': func(numpy.round, 'double'), + 'trunc': func(numpy.trunc, 'double'), + 'sign': func(numpy.sign, 'double'), 'where': where_func, @@ -365,6 +377,11 @@ def multiply(x, y): 'complex': func(complex, 'complex'), 'conj': func(numpy.conj, 'complex'), + 'isnan': func(numpy.isnan, 'double'), + 'isfinite': func(numpy.isfinite, 'double'), + 'isinf': func(numpy.isinf, 'double'), + 'signbit': func(numpy.signbit, 'double'), + 'sum': gen_reduce_axis_func('sum'), 'prod': gen_reduce_axis_func('prod'), 'min': gen_reduce_axis_func('min'), @@ -436,6 +453,7 @@ def __bool__(self): __mul__ = __rmul__ = binop('mul') __truediv__ = truediv_op __rtruediv__ = rtruediv_op + __floordiv__ = binop("floordiv") __pow__ = pow_op __rpow__ = binop('pow', reversed=True) __mod__ = binop('mod') @@ -446,10 +464,10 @@ def __bool__(self): __rshift__ = binop('rshift') __rrshift__ = binop('rshift', reversed=True) - # boolean operations - - __and__ = binop('and', kind='bool') - __or__ = binop('or', kind='bool') + # bitwise or logical operations + __and__ = binop('and') + __or__ = binop('or') + __xor__ = binop('xor') __gt__ = binop('gt', kind='bool') __ge__ = binop('ge', kind='bool') @@ -513,6 +531,9 @@ class OpNode(ExpressionNode): def __init__(self, opcode=None, args=None, kind=None): if (kind is None) and (args is not None): kind = commonKind(args) + if kind=='bool': # handle bool*bool and bool+bool cases + opcode = 'and' if opcode=='mul' else opcode + opcode = 'or' if opcode=='add' else opcode ExpressionNode.__init__(self, value=opcode, kind=kind, children=args) @@ -520,4 +541,6 @@ class FuncNode(OpNode): def 
__init__(self, opcode=None, args=None, kind=None): if (kind is None) and (args is not None): kind = commonKind(args) + if opcode in ("isnan", "isfinite", "isinf", "signbit"): # bodge for boolean return functions + kind = 'bool' OpNode.__init__(self, opcode, args, kind) diff --git a/numexpr/functions.hpp b/numexpr/functions.hpp index 78e03f4..fc38fb8 100644 --- a/numexpr/functions.hpp +++ b/numexpr/functions.hpp @@ -30,12 +30,17 @@ FUNC_FF(FUNC_ARCTANH_FF, "arctanh_ff", atanhf, atanhf2, vsAtanh) FUNC_FF(FUNC_LOG_FF, "log_ff", logf, logf2, vsLn) FUNC_FF(FUNC_LOG1P_FF, "log1p_ff", log1pf, log1pf2, vsLog1p) FUNC_FF(FUNC_LOG10_FF, "log10_ff", log10f, log10f2, vsLog10) +FUNC_FF(FUNC_LOG2_FF, "log2_ff", log2f, log2f2, vsLog2) FUNC_FF(FUNC_EXP_FF, "exp_ff", expf, expf2, vsExp) FUNC_FF(FUNC_EXPM1_FF, "expm1_ff", expm1f, expm1f2, vsExpm1) FUNC_FF(FUNC_ABS_FF, "absolute_ff", fabsf, fabsf2, vsAbs) FUNC_FF(FUNC_CONJ_FF, "conjugate_ff",fconjf, fconjf2, vsConj) FUNC_FF(FUNC_CEIL_FF, "ceil_ff", ceilf, ceilf2, vsCeil) FUNC_FF(FUNC_FLOOR_FF, "floor_ff", floorf, floorf2, vsFloor) +FUNC_FF(FUNC_TRUNC_FF, "trunc_ff", truncf, truncf2, vsTrunc) +FUNC_FF(FUNC_SIGN_FF, "sign_ff", signf, signf2, vsSign) +//rint rounds to nearest even integer, matching NumPy (round doesn't) +FUNC_FF(FUNC_ROUND_FF, "round_ff", rintf, rintf2, vsRint) FUNC_FF(FUNC_FF_LAST, NULL, NULL, NULL, NULL) #ifdef ELIDE_FUNC_FF #undef ELIDE_FUNC_FF @@ -48,6 +53,11 @@ FUNC_FF(FUNC_FF_LAST, NULL, NULL, NULL, NULL) #endif FUNC_FFF(FUNC_FMOD_FFF, "fmod_fff", fmodf, fmodf2, vsfmod) FUNC_FFF(FUNC_ARCTAN2_FFF, "arctan2_fff", atan2f, atan2f2, vsAtan2) +FUNC_FFF(FUNC_HYPOT_FFF, "hypot_fff", hypotf, hypotf2, vsHypot) +FUNC_FFF(FUNC_NEXTAFTER_FFF, "nextafter_fff", nextafterf, nextafterf2, vsNextAfter) +FUNC_FFF(FUNC_COPYSIGN_FFF, "copysign_fff", copysignf, copysignf2, vsCopySign) +FUNC_FFF(FUNC_MAXIMUM_FFF, "maximum_fff", fmaxf_, fmaxf2, vsFmax_) +FUNC_FFF(FUNC_MINIMUM_FFF, "minimum_fff", fminf_, fminf2, vsFmin_) 
FUNC_FFF(FUNC_FFF_LAST, NULL, NULL, NULL, NULL) #ifdef ELIDE_FUNC_FFF #undef ELIDE_FUNC_FFF @@ -74,24 +84,64 @@ FUNC_DD(FUNC_ARCTANH_DD, "arctanh_dd", atanh, vdAtanh) FUNC_DD(FUNC_LOG_DD, "log_dd", log, vdLn) FUNC_DD(FUNC_LOG1P_DD, "log1p_dd", log1p, vdLog1p) FUNC_DD(FUNC_LOG10_DD, "log10_dd", log10, vdLog10) +FUNC_DD(FUNC_LOG2_DD, "log2_dd", log2, vdLog2) FUNC_DD(FUNC_EXP_DD, "exp_dd", exp, vdExp) FUNC_DD(FUNC_EXPM1_DD, "expm1_dd", expm1, vdExpm1) FUNC_DD(FUNC_ABS_DD, "absolute_dd", fabs, vdAbs) FUNC_DD(FUNC_CONJ_DD, "conjugate_dd",fconj, vdConj) FUNC_DD(FUNC_CEIL_DD, "ceil_dd", ceil, vdCeil) FUNC_DD(FUNC_FLOOR_DD, "floor_dd", floor, vdFloor) +FUNC_DD(FUNC_TRUNC_DD, "trunc_dd", trunc, vdTrunc) +FUNC_DD(FUNC_SIGN_DD, "sign_dd", sign, vdSign) +//rint rounds to nearest even integer, matching NumPy (round doesn't) +FUNC_DD(FUNC_ROUND_DD, "round_dd", rint, vdRint) FUNC_DD(FUNC_DD_LAST, NULL, NULL, NULL) #ifdef ELIDE_FUNC_DD #undef ELIDE_FUNC_DD #undef FUNC_DD #endif +// double -> boolean functions +#ifndef FUNC_BD +#define ELIDE_FUNC_BD +#define FUNC_BD(...) +#endif +FUNC_BD(FUNC_ISNAN_BD, "isnan_bd", isnand, vdIsnan) +FUNC_BD(FUNC_ISFINITE_BD, "isfinite_bd", isfinited, vdIsfinite) +FUNC_BD(FUNC_ISINF_BD, "isinf_bd", isinfd, vdIsinf) +FUNC_BD(FUNC_SIGNBIT_BD, "signbit_bd", signbit, vdSignBit) +FUNC_BD(FUNC_BD_LAST, NULL, NULL, NULL) +#ifdef ELIDE_FUNC_BD +#undef ELIDE_FUNC_BD +#undef FUNC_BD +#endif + +// float -> boolean functions (C99 defines the same function for all types) +#ifndef FUNC_BF +#define ELIDE_FUNC_BF +#define FUNC_BF(...) 
+#endif // use wrappers as there is name collision with isnanf in std +FUNC_BF(FUNC_ISNAN_BF, "isnan_bf", isnanf_, isnanf2, vsIsnan) +FUNC_BF(FUNC_ISFINITE_BF, "isfinite_bf", isfinitef_, isfinitef2, vsIsfinite) +FUNC_BF(FUNC_ISINF_BF, "isinf_bf", isinff_, isinff2, vsIsinf) +FUNC_BF(FUNC_SIGNBIT_BF, "signbit_bf", signbitf, signbitf2, vsSignBit) +FUNC_BF(FUNC_BF_LAST, NULL, NULL, NULL, NULL) +#ifdef ELIDE_FUNC_BF +#undef ELIDE_FUNC_BF +#undef FUNC_BF +#endif + #ifndef FUNC_DDD #define ELIDE_FUNC_DDD #define FUNC_DDD(...) #endif FUNC_DDD(FUNC_FMOD_DDD, "fmod_ddd", fmod, vdfmod) FUNC_DDD(FUNC_ARCTAN2_DDD, "arctan2_ddd", atan2, vdAtan2) +FUNC_DDD(FUNC_HYPOT_DDD, "hypot_ddd", hypot, vdHypot) +FUNC_DDD(FUNC_NEXTAFTER_DDD, "nextafter_ddd", nextafter, vdNextAfter) +FUNC_DDD(FUNC_COPYSIGN_DDD, "copysign_ddd", copysign, vdCopySign) +FUNC_DDD(FUNC_MAXIMUM_DDD, "maximum_ddd", fmaxd, vdFmax_) +FUNC_DDD(FUNC_MINIMUM_DDD, "minimum_ddd", fmind, vdFmin_) FUNC_DDD(FUNC_DDD_LAST, NULL, NULL, NULL) #ifdef ELIDE_FUNC_DDD #undef ELIDE_FUNC_DDD @@ -118,10 +168,14 @@ FUNC_CC(FUNC_ARCTANH_CC, "arctanh_cc", nc_atanh, vzAtanh) FUNC_CC(FUNC_LOG_CC, "log_cc", nc_log, vzLn) FUNC_CC(FUNC_LOG1P_CC, "log1p_cc", nc_log1p, vzLog1p) FUNC_CC(FUNC_LOG10_CC, "log10_cc", nc_log10, vzLog10) +FUNC_CC(FUNC_LOG2_CC, "log2_cc", nc_log2, vzLog2) FUNC_CC(FUNC_EXP_CC, "exp_cc", nc_exp, vzExp) FUNC_CC(FUNC_EXPM1_CC, "expm1_cc", nc_expm1, vzExpm1) FUNC_CC(FUNC_ABS_CC, "absolute_cc", nc_abs, vzAbs_) FUNC_CC(FUNC_CONJ_CC, "conjugate_cc",nc_conj, vzConj) +FUNC_CC(FUNC_SIGN_CC, "sign_cc", nc_sign, vzSign) +// rint rounds to nearest even integer, matches NumPy behaviour (round doesn't) +FUNC_CC(FUNC_ROUND_CC, "round_cc", nc_rint, vzRint) FUNC_CC(FUNC_CC_LAST, NULL, NULL, NULL) #ifdef ELIDE_FUNC_CC #undef ELIDE_FUNC_CC @@ -138,3 +192,44 @@ FUNC_CCC(FUNC_CCC_LAST, NULL, NULL) #undef ELIDE_FUNC_CCC #undef FUNC_CCC #endif + +// complex -> boolean functions +#ifndef FUNC_BC +#define ELIDE_FUNC_BC +#define FUNC_BC(...) 
+#endif // use wrappers as there is name collision with isnanf in std +FUNC_BC(FUNC_ISNAN_BC, "isnan_bc", nc_isnan, vzIsnan) +FUNC_BC(FUNC_ISFINITE_BC, "isfinite_bc", nc_isfinite, vzIsfinite) +FUNC_BC(FUNC_ISINF_BC, "isinf_bc", nc_isinf, vzIsinf) +FUNC_BC(FUNC_BC_LAST, NULL, NULL, NULL) +#ifdef ELIDE_FUNC_BC +#undef ELIDE_FUNC_BC +#undef FUNC_BC +#endif + +// int -> int functions +#ifndef FUNC_II +#define ELIDE_FUNC_II +#define FUNC_II(...) +#endif +FUNC_II(FUNC_SIGN_II, "sign_ii", signi, viSign) +FUNC_II(FUNC_ROUND_II, "round_ii", rinti, viRint) +FUNC_II(FUNC_ABS_II, "absolute_ii", fabsi, viFabs) +FUNC_II(FUNC_II_LAST, NULL, NULL, NULL) +#ifdef ELIDE_FUNC_II +#undef ELIDE_FUNC_II +#undef FUNC_II +#endif + +#ifndef FUNC_LL +#define ELIDE_FUNC_LL +#define FUNC_LL(...) +#endif +FUNC_LL(FUNC_SIGN_LL, "sign_ll", signl, vlSign) +FUNC_LL(FUNC_ROUND_LL, "round_ll", rintl, vlRint) +FUNC_LL(FUNC_ABS_LL, "absolute_ll", fabsl, vlFabs) +FUNC_LL(FUNC_LL_LAST, NULL, NULL, NULL) +#ifdef ELIDE_FUNC_LL +#undef ELIDE_FUNC_LL +#undef FUNC_LL +#endif diff --git a/numexpr/interp_body.cpp b/numexpr/interp_body.cpp index 09b9da9..743f8ab 100644 --- a/numexpr/interp_body.cpp +++ b/numexpr/interp_body.cpp @@ -7,13 +7,13 @@ See LICENSE.txt for details about copyright and rights to use. **********************************************************************/ -// WARNING: This file is included multiple times in `interpreter.cpp`. It is -// essentially a very macro-heavy jump table. Interpretation is best done by +// WARNING: This file is included multiple times in `interpreter.cpp`. It is +// essentially a very macro-heavy jump table. Interpretation is best done by // the developer by expanding all macros (e.g. adding `'-E'` to the `extra_cflags` // argument in `setup.py` and looking at the resulting `interpreter.cpp`. 
// -// Changes made to this file will not be recognized by the compile, so the developer -// must make a trivial change is made to `interpreter.cpp` or delete the `build/` +// Changes made to this file will not be recognized by the compile, so the developer +// must make a trivial change is made to `interpreter.cpp` or delete the `build/` // directory in-between each build. { #define VEC_LOOP(expr) for(j = 0; j < BLOCK_SIZE; j++) { \ @@ -220,6 +220,7 @@ case OP_INVERT_BB: VEC_ARG1(b_dest = !b1); case OP_AND_BBB: VEC_ARG2(b_dest = (b1 && b2)); case OP_OR_BBB: VEC_ARG2(b_dest = (b1 || b2)); + case OP_XOR_BBB: VEC_ARG2(b_dest = (b1 || b2) && !(b1 && b2) ); case OP_EQ_BBB: VEC_ARG2(b_dest = (b1 == b2)); case OP_NE_BBB: VEC_ARG2(b_dest = (b1 != b2)); @@ -264,10 +265,16 @@ case OP_DIV_III: VEC_ARG2(i_dest = i2 ? (i1 / i2) : 0); case OP_POW_III: VEC_ARG2(i_dest = (i2 < 0) ? (1 / i1) : (int)pow((double)i1, i2)); case OP_MOD_III: VEC_ARG2(i_dest = i2 == 0 ? 0 :((i1 % i2) + i2) % i2); + case OP_FLOORDIV_III: VEC_ARG2(i_dest = i2 ? (i1 / i2) - ((i1 % i2 != 0) && (i1 < 0 != i2 < 0)) : 0); case OP_LSHIFT_III: VEC_ARG2(i_dest = i1 << i2); case OP_RSHIFT_III: VEC_ARG2(i_dest = i1 >> i2); case OP_WHERE_IBII: VEC_ARG3(i_dest = b1 ? i2 : i3); + //Bitwise ops + case OP_INVERT_II: VEC_ARG1(i_dest = ~i1); + case OP_AND_III: VEC_ARG2(i_dest = (i1 & i2)); + case OP_OR_III: VEC_ARG2(i_dest = (i1 | i2)); + case OP_XOR_III: VEC_ARG2(i_dest = (i1 ^ i2)); /* Long */ case OP_CAST_LI: VEC_ARG1(l_dest = (long long)(i1)); @@ -284,10 +291,16 @@ case OP_POW_LLL: VEC_ARG2(l_dest = (l2 < 0) ? (1 / l1) : (long long)llround(pow((long double)l1, (long double)l2))); #endif case OP_MOD_LLL: VEC_ARG2(l_dest = l2 == 0 ? 0 :((l1 % l2) + l2) % l2); + case OP_FLOORDIV_LLL: VEC_ARG2(l_dest = l2 ? (l1 / l2) - ((l1 % l2 != 0) && (l1 < 0 != l2 < 0)): 0); case OP_LSHIFT_LLL: VEC_ARG2(l_dest = l1 << l2); case OP_RSHIFT_LLL: VEC_ARG2(l_dest = l1 >> l2); case OP_WHERE_LBLL: VEC_ARG3(l_dest = b1 ? 
l2 : l3); + //Bitwise ops + case OP_INVERT_LL: VEC_ARG1(l_dest = ~l1); + case OP_AND_LLL: VEC_ARG2(l_dest = (l1 & l2)); + case OP_OR_LLL: VEC_ARG2(l_dest = (l1 | l2)); + case OP_XOR_LLL: VEC_ARG2(l_dest = (l1 ^ l2)); /* Float */ case OP_CAST_FI: VEC_ARG1(f_dest = (float)(i1)); @@ -313,6 +326,7 @@ VEC_ARG2(f_dest = powf(f1, f2)); #endif case OP_MOD_FFF: VEC_ARG2(f_dest = f1 - floorf(f1/f2) * f2); + case OP_FLOORDIV_FFF: VEC_ARG2(f_dest = floorf(f1/f2)); case OP_SQRT_FF: #ifdef USE_VML @@ -364,6 +378,7 @@ VEC_ARG2(d_dest = pow(d1, d2)); #endif case OP_MOD_DDD: VEC_ARG2(d_dest = d1 - floor(d1/d2) * d2); + case OP_FLOORDIV_DDD: VEC_ARG2(d_dest = floor(d1/d2)); case OP_SQRT_DD: #ifdef USE_VML @@ -451,6 +466,50 @@ case OP_COMPLEX_CDD: VEC_ARG2(cr_dest = d1; ci_dest = d2); + // Boolean return types + case OP_FUNC_BFN: +#ifdef USE_VML + VEC_ARG1_VML(functions_bf_vml[arg2](BLOCK_SIZE, + (float*)x1, (bool*)dest)); +#else + VEC_ARG1(b_dest = functions_bf[arg2](f1)); +#endif + + + case OP_FUNC_BDN: +#ifdef USE_VML + VEC_ARG1_VML(functions_bd_vml[arg2](BLOCK_SIZE, + (double*)x1, (bool*)dest)); +#else + VEC_ARG1(b_dest = functions_bd[arg2](d1)); +#endif + + case OP_FUNC_BCN: +#ifdef USE_VML + VEC_ARG1_VML(functions_bc_vml[arg2](BLOCK_SIZE, + (const MKL_Complex16*)x1, (bool*)dest)); +#else + VEC_ARG1(ca.real(c1r); + ca.imag(c1i); + b_dest = functions_bc[arg2](&ca)); +#endif + + /* Integer return types */ + case OP_FUNC_IIN: +#ifdef USE_VML + VEC_ARG1_VML(functions_ii_vml[arg2](BLOCK_SIZE, + (int*)x1, (int*)dest)); +#else + VEC_ARG1(i_dest = functions_ii[arg2](i1)); +#endif + case OP_FUNC_LLN: +#ifdef USE_VML + VEC_ARG1_VML(functions_ll_vml[arg2](BLOCK_SIZE, + (long*)x1, (long*)dest)); +#else + VEC_ARG1(l_dest = functions_ll[arg2](l1)); +#endif + /* Reductions */ case OP_SUM_IIN: VEC_ARG1(i_reduce += i1); case OP_SUM_LLN: VEC_ARG1(l_reduce += l1); diff --git a/numexpr/interpreter.cpp b/numexpr/interpreter.cpp index edebd71..409ad3d 100644 --- a/numexpr/interpreter.cpp +++ 
b/numexpr/interpreter.cpp @@ -18,6 +18,7 @@ #include "complex_functions.hpp" #include "interpreter.hpp" #include "numexpr_object.hpp" +#include "bespoke_functions.hpp" #ifdef _MSC_VER /* Some missing symbols and functions for Win */ @@ -25,7 +26,7 @@ #define fmin min #define NE_INFINITY (DBL_MAX+DBL_MAX) #define NE_NAN (INFINITY-INFINITY) -#else +#else #define NE_INFINITY INFINITY #define NE_NAN NAN #endif @@ -129,6 +130,9 @@ op_signature(int op, unsigned int n) { typedef float (*FuncFFPtr)(float); #ifdef _WIN32 +inline float signf2(float x) { // needed to wait for bespoke_functions to be loaded + return signf(x); +} FuncFFPtr functions_ff[] = { #define FUNC_FF(fop, s, f, f_win32, ...) f_win32, #include "functions.hpp" @@ -142,17 +146,6 @@ FuncFFPtr functions_ff[] = { }; #endif -#ifdef USE_VML -/* Fake vsConj function just for casting purposes inside numexpr */ -static void vsConj(MKL_INT n, const float* x1, float* dest) -{ - MKL_INT j; - for (j=0; j*); +FuncBCPtr functions_bc[] = { +#define FUNC_BC(fop, s, f, ...) 
f, +#include "functions.hpp" +#undef FUNC_BC +}; + + +#ifdef USE_VML +typedef void (*FuncBCPtr_vml)(MKL_INT, const MKL_Complex16[], bool*); +FuncBCPtr_vml functions_bc_vml[] = { +#define FUNC_BC(fop, s, f, f_vml) f_vml, +#include "functions.hpp" +#undef FUNC_BC }; #endif @@ -233,15 +264,6 @@ FuncDDDPtr functions_ddd[] = { }; #ifdef USE_VML -/* fmod not available in VML */ -static void vdfmod(MKL_INT n, const double* x1, const double* x2, double* dest) -{ - MKL_INT j; - for(j=0; j < n; j++) { - dest[j] = fmod(x1[j], x2[j]); - }; -}; - typedef void (*FuncDDDPtr_vml)(MKL_INT, const double*, const double*, double*); FuncDDDPtr_vml functions_ddd_vml[] = { #define FUNC_DDD(fop, s, f, f_vml) f_vml, @@ -261,38 +283,7 @@ FuncCCPtr functions_cc[] = { }; #ifdef USE_VML -/* complex expm1 not available in VML */ -static void vzExpm1(MKL_INT n, const MKL_Complex16* x1, MKL_Complex16* dest) -{ - MKL_INT j; - vzExp(n, x1, dest); - for (j=0; j 127 int sig; - char last_opcode; + unsigned char last_opcode; Py_ssize_t end = PyBytes_Size(program); - char *program_str = PyBytes_AS_STRING(program); + unsigned char *program_str = (unsigned char *)PyBytes_AS_STRING(program); do { end -= 4; @@ -464,7 +488,38 @@ check_program(NumExprObject *self) PyErr_Format(PyExc_RuntimeError, "invalid program: funccode out of range (%i) at %i", arg, argloc); return -1; } - } else if (op >= OP_REDUCTION) { + } + else if (op == OP_FUNC_BDN) { + if (arg < 0 || arg >= FUNC_BD_LAST) { + PyErr_Format(PyExc_RuntimeError, "invalid program: funccode out of range (%i) at %i", arg, argloc); + return -1; + } + } + else if (op == OP_FUNC_BFN) { + if (arg < 0 || arg >= FUNC_BF_LAST) { + PyErr_Format(PyExc_RuntimeError, "invalid program: funccode out of range (%i) at %i", arg, argloc); + return -1; + } + } + else if (op == OP_FUNC_BCN) { + if (arg < 0 || arg >= FUNC_BC_LAST) { + PyErr_Format(PyExc_RuntimeError, "invalid program: funccode out of range (%i) at %i", arg, argloc); + return -1; + } + } + else if (op == 
OP_FUNC_IIN) { + if (arg < 0 || arg >= FUNC_II_LAST) { + PyErr_Format(PyExc_RuntimeError, "invalid program: funccode out of range (%i) at %i", arg, argloc); + return -1; + } + } + else if (op == OP_FUNC_LLN) { + if (arg < 0 || arg >= FUNC_LL_LAST) { + PyErr_Format(PyExc_RuntimeError, "invalid program: funccode out of range (%i) at %i", arg, argloc); + return -1; + } + } + else if (op >= OP_REDUCTION) { ; } else { PyErr_Format(PyExc_RuntimeError, "invalid program: internal checker error processing %i", argloc); @@ -556,7 +611,7 @@ stringcontains(const char *haystack_start, const char *needle_start, npy_intp ma size_t si = 0; size_t min_len = min(needle_len, haystack_len); - while (*haystack && *needle && si < min_len) + while (si < min_len && *haystack && *needle) { ok &= *haystack++ == *needle++; si++; @@ -573,7 +628,7 @@ stringcontains(const char *haystack_start, const char *needle_start, npy_intp ma } /* calc haystack length */ - while (*haystack && si < haystack_len) { + while (si < haystack_len && *haystack) { haystack++; si++; } @@ -652,6 +707,7 @@ int vm_engine_iter_task(NpyIter *iter, npy_intp *memsteps, /* Then finish off the rest */ if (block_size > 0) do { + block_size = *size_ptr; #define REDUCTION_INNER_LOOP #define BLOCK_SIZE block_size #include "interp_body.cpp" @@ -698,6 +754,7 @@ vm_engine_iter_outer_reduce_task(NpyIter *iter, npy_intp *memsteps, /* Then finish off the rest */ if (block_size > 0) do { + block_size = *size_ptr; #define BLOCK_SIZE block_size #define NO_OUTPUT_BUFFERING // Because it's a reduction #include "interp_body.cpp" @@ -1260,7 +1317,7 @@ NumExpr_run(NumExprObject *self, PyObject *args, PyObject *kwds) PyArrayObject *singleton; bool writeback; // NOTE: cannot assign on declaration due to `goto` statements - singleton = NULL; + singleton = NULL; writeback = false; if (n_inputs == 0) { char retsig = get_return_sig(self->program); @@ -1319,10 +1376,10 @@ NumExpr_run(NumExprObject *self, PyObject *args, PyObject *kwds) /* Allocate 
the iterator or nested iterators */ if (reduction_size < 0 || full_reduction) { /* When there's no reduction, reduction_size is 1 as well */ - // RAM: in issue #277 this was also the case for reductions on arrays - // with axis=0 having singleton dimension, i.e. such ops were interpreted - // as full_reductions when they weren't in Numpy. As such, the default - // reduction_size is now -1 and we add the flag for full_reduction, + // RAM: in issue #277 this was also the case for reductions on arrays + // with axis=0 having singleton dimension, i.e. such ops were interpreted + // as full_reductions when they weren't in Numpy. As such, the default + // reduction_size is now -1 and we add the flag for full_reduction, // e.g. ne.evaluate("sum(a)")" iter = NpyIter_AdvancedNew(n_inputs+1, operands, NPY_ITER_BUFFERED| diff --git a/numexpr/interpreter.hpp b/numexpr/interpreter.hpp index f9ac1c7..3ec09bb 100644 --- a/numexpr/interpreter.hpp +++ b/numexpr/interpreter.hpp @@ -18,6 +18,12 @@ enum FuncFFCodes { #undef FUNC_FF }; +enum FuncBFCodes { +#define FUNC_BF(fop, ...) fop, +#include "functions.hpp" +#undef FUNC_BF +}; + enum FuncFFFCodes { #define FUNC_FFF(fop, ...) fop, #include "functions.hpp" @@ -30,6 +36,30 @@ enum FuncDDCodes { #undef FUNC_DD }; +enum FuncBDCodes { +#define FUNC_BD(fop, ...) fop, +#include "functions.hpp" +#undef FUNC_BD +}; + +enum FuncBCCodes { +#define FUNC_BC(fop, ...) fop, +#include "functions.hpp" +#undef FUNC_BC +}; + +enum FuncIICodes { +#define FUNC_II(fop, ...) fop, +#include "functions.hpp" +#undef FUNC_II +}; + +enum FuncLLCodes { +#define FUNC_LL(fop, ...) fop, +#include "functions.hpp" +#undef FUNC_LL +}; + enum FuncDDDCodes { #define FUNC_DDD(fop, ...) 
fop, #include "functions.hpp" @@ -75,7 +105,7 @@ struct thread_data { int ret_code; int *pc_error; char **errmsg; - // NOTE: memsteps, iter, and reduce_iter are arrays, they MUST be allocated + // NOTE: memsteps, iter, and reduce_iter are arrays, they MUST be allocated // to length `global_max_threads` before module load. // One memsteps array per thread // npy_intp *memsteps[MAX_THREADS]; diff --git a/numexpr/module.cpp b/numexpr/module.cpp index 66b5b77..67629bd 100644 --- a/numexpr/module.cpp +++ b/numexpr/module.cpp @@ -51,7 +51,9 @@ void *th_worker(void *tidptr) while (1) { /* Sentinels have to be initialised yet */ - gs.init_sentinels_done = 0; + if (tid == 0) { + gs.init_sentinels_done = 0; + } /* Meeting point for all threads (wait for initialization) */ pthread_mutex_lock(&gs.count_threads_mutex); @@ -380,7 +382,7 @@ Py_set_num_threads(PyObject *self, PyObject *args) } static PyObject* -Py_get_num_threads(PyObject *self, PyObject *args) +Py_get_num_threads(PyObject *self, PyObject *args) { int n_thread; n_thread = gs.nthreads; @@ -477,6 +479,10 @@ PyInit_interpreter(void) { if (m == NULL) INITERROR; + #ifdef Py_GIL_DISABLED + PyUnstable_Module_SetGIL(m, Py_MOD_GIL_NOT_USED); + #endif + Py_INCREF(&NumExprType); PyModule_AddObject(m, "NumExpr", (PyObject *)&NumExprType); @@ -500,14 +506,23 @@ PyInit_interpreter(void) { #define FUNC_FF(name, sname, ...) add_func(name, sname); #define FUNC_FFF(name, sname, ...) add_func(name, sname); #define FUNC_DD(name, sname, ...) add_func(name, sname); +#define FUNC_BF(name, sname, ...) add_func(name, sname); +#define FUNC_BD(name, sname, ...) add_func(name, sname); +#define FUNC_BC(name, sname, ...) add_func(name, sname); #define FUNC_DDD(name, sname, ...) add_func(name, sname); #define FUNC_CC(name, sname, ...) add_func(name, sname); #define FUNC_CCC(name, sname, ...) add_func(name, sname); +#define FUNC_II(name, sname, ...) add_func(name, sname); +#define FUNC_LL(name, sname, ...) 
add_func(name, sname); #include "functions.hpp" +#undef FUNC_LL +#undef FUNC_II #undef FUNC_CCC #undef FUNC_CC #undef FUNC_DDD -#undef FUNC_DD +#undef FUNC_BC +#undef FUNC_BD +#undef FUNC_BF #undef FUNC_DD #undef FUNC_FFF #undef FUNC_FF diff --git a/numexpr/module.hpp b/numexpr/module.hpp index cf7b571..079a17f 100644 --- a/numexpr/module.hpp +++ b/numexpr/module.hpp @@ -23,7 +23,7 @@ struct global_state { int end_threads; /* should exisiting threads end? */ // pthread_t threads[MAX_THREADS]; /* opaque structure for threads */ // int tids[MAX_THREADS]; /* ID per each thread */ - /* NOTE: threads and tids are arrays, they MUST be allocated to length + /* NOTE: threads and tids are arrays, they MUST be allocated to length `global_max_threads` before module load. */ pthread_t *threads; /* opaque structure for threads */ int *tids; /* ID per each thread */ @@ -36,7 +36,7 @@ struct global_state { /* Synchronization variables for threadpool state */ pthread_mutex_t count_mutex; int count_threads; - int barrier_passed; /* indicates if the thread pool's thread barrier + int barrier_passed; /* indicates if the thread pool's thread barrier is unlocked and ready for the VM to process.*/ pthread_mutex_t count_threads_mutex; pthread_cond_t count_threads_cv; diff --git a/numexpr/msvc_function_stubs.hpp b/numexpr/msvc_function_stubs.hpp index 0c28f22..8e4d722 100644 --- a/numexpr/msvc_function_stubs.hpp +++ b/numexpr/msvc_function_stubs.hpp @@ -1,3 +1,5 @@ +#include <float.h> // for _finite, _isnan on MSVC + #ifndef NUMEXPR_MSVC_FUNCTION_STUBS_HPP #define NUMEXPR_MSVC_FUNCTION_STUBS_HPP @@ -14,37 +16,65 @@ definitions in <math.h> are actually #define'd and are not usable as function pointers :-/ */ +/* Due to casting problems (normally return ints not bools, easiest to define +non-overloaded wrappers for these functions) */ +// MSVC version: use global ::isfinite / ::isnan +inline bool isfinitef_(float x) { return !!::_finite(x); } // MSVC has _finite +inline bool isnanf_(float x) { return
!!::_isnan(x); } // MSVC has _isnan +inline bool isfinited(double x) { return !!::_finite(x); } +inline bool isnand(double x) { return !!::_isnan(x); } +inline bool isinfd(double x) { return !!::isinf(x); } +inline bool isinff_(float x) { return !!::isinf(x); } + +// To handle overloading of fmax/fmin in cmath and match NumPy behaviour for NaNs +inline double fmaxd(double x, double y) { return (isnand(x) | isnand(y))? NAN : fmax(x, y); } +inline double fmind(double x, double y) { return (isnand(x) | isnand(y))? NAN : fmin(x, y); } + + #if _MSC_VER < 1400 // 1310 == MSVC 7.1 -/* Apparently, single precision functions are not included in MSVC 7.1 */ - -#define sqrtf(x) ((float)sqrt((double)(x))) -#define sinf(x) ((float)sin((double)(x))) -#define cosf(x) ((float)cos((double)(x))) -#define tanf(x) ((float)tan((double)(x))) -#define asinf(x) ((float)asin((double)(x))) -#define acosf(x) ((float)acos((double)(x))) -#define atanf(x) ((float)atan((double)(x))) -#define sinhf(x) ((float)sinh((double)(x))) -#define coshf(x) ((float)cosh((double)(x))) -#define tanhf(x) ((float)tanh((double)(x))) -#define asinhf(x) ((float)asinh((double)(x))) -#define acoshf(x) ((float)acosh((double)(x))) -#define atanhf(x) ((float)atanh((double)(x))) -#define logf(x) ((float)log((double)(x))) -#define log1pf(x) ((float)log1p((double)(x))) -#define log10f(x) ((float)log10((double)(x))) -#define expf(x) ((float)exp((double)(x))) -#define expm1f(x) ((float)expm1((double)(x))) -#define fabsf(x) ((float)fabs((double)(x))) -#define fmodf(x, y) ((float)fmod((double)(x), (double)(y))) -#define atan2f(x, y) ((float)atan2((double)(x), (double)(y))) -#define ceilf(x) ((float)ceil((double)(x))) - -/* The next are directly called from interp_body.cpp */ -#define powf(x, y) ((float)pow((double)(x), (double)(y))) -#define floorf(x) ((float)floor((double)(x))) - -#endif // _MSC_VER < 1400 + /* Apparently, single precision functions are not included in MSVC 7.1 */ + + #define sqrtf(x) 
((float)sqrt((double)(x))) + #define sinf(x) ((float)sin((double)(x))) + #define cosf(x) ((float)cos((double)(x))) + #define tanf(x) ((float)tan((double)(x))) + #define asinf(x) ((float)asin((double)(x))) + #define acosf(x) ((float)acos((double)(x))) + #define atanf(x) ((float)atan((double)(x))) + #define sinhf(x) ((float)sinh((double)(x))) + #define coshf(x) ((float)cosh((double)(x))) + #define tanhf(x) ((float)tanh((double)(x))) + #define asinhf(x) ((float)asinh((double)(x))) + #define acoshf(x) ((float)acosh((double)(x))) + #define atanhf(x) ((float)atanh((double)(x))) + #define logf(x) ((float)log((double)(x))) + #define log1pf(x) ((float)log1p((double)(x))) + #define log10f(x) ((float)log10((double)(x))) + #define log2f(x) ((float)log2((double)(x))) + #define expf(x) ((float)exp((double)(x))) + #define expm1f(x) ((float)expm1((double)(x))) + #define fabsf(x) ((float)fabs((double)(x))) + #define fmodf(x, y) ((float)fmod((double)(x), (double)(y))) + #define atan2f(x, y) ((float)atan2((double)(x), (double)(y))) + #define hypotf(x, y) ((float)hypot((double)(x), (double)(y))) + #define copysignf(x, y) ((float)copysign((double)(x), (double)(y))) + #define nextafterf(x, y) ((float)nextafter((double)(x), (double)(y))) + #define ceilf(x) ((float)ceil((double)(x))) + /* NOTE(review): removed duplicate one-argument hypotf macro; hypot requires two arguments and hypotf(x, y) is already defined above */ + #define rintf(x) ((float)rint((double)(x))) + #define truncf(x) ((float)trunc((double)(x))) + + + /* The next are directly called from interp_body.cpp */ + #define powf(x, y) ((float)pow((double)(x), (double)(y))) + #define floorf(x) ((float)floor((double)(x))) + + #define fmaxf_(x, y) ((float)fmaxd((double)(x), (double)(y))) // define fmaxf_ since fmaxf doesn't exist for early MSVC + #define fminf_(x, y) ((float)fmind((double)(x), (double)(y))) +#else + inline float fmaxf_(float x, float y) { return (isnanf_(x) | isnanf_(y))? NAN : fmaxf(x, y); } + inline float fminf_(float x, float y) { return (isnanf_(x) | isnanf_(y))?
NAN : fminf(x, y); } +#endif // _MSC_VER < 1400 /* Now the actual stubs */ @@ -113,6 +143,10 @@ inline float log10f2(float x) { return log10f(x); } +inline float log2f2(float x) { + return log2f(x); +} + inline float expf2(float x) { return expf(x); } @@ -133,6 +167,41 @@ inline float atan2f2(float x, float y) { return atan2f(x, y); } +inline float hypotf2(float x, float y) { + return hypotf(x, y); +} + +inline float nextafterf2(float x, float y) { + return nextafterf(x, y); +} + +inline float copysignf2(float x, float y) { + return copysignf(x, y); +} + +inline float fmaxf2(float x, float y) { + return fmaxf_(x, y); +} + +inline float fminf2(float x, float y) { + return fminf_(x, y); +} + + +// Boolean output functions +inline bool isnanf2(float x) { + return isnanf_(x); +} + +inline bool isfinitef2(float x) { + return isfinitef_(x); +} + +inline bool isinff2(float x) { + return isinff_(x); +} + + // Needed for allowing the internal casting in numexpr machinery for // conjugate operations inline float fconjf2(float x) { @@ -147,4 +216,16 @@ inline float floorf2(float x) { return floorf(x); } +inline float rintf2(float x) { + return rintf(x); +} + +inline float truncf2(float x) { + return truncf(x); +} + +inline bool signbitf2(float x) { + return signbitf(x); +} + #endif // NUMEXPR_MSVC_FUNCTION_STUBS_HPP diff --git a/numexpr/necompiler.py b/numexpr/necompiler.py index a693c4d..96c66f6 100644 --- a/numexpr/necompiler.py +++ b/numexpr/necompiler.py @@ -8,17 +8,18 @@ # rights to use. 
#################################################################### -from typing import Optional, Dict import __future__ -import sys + import os -import threading import re +import sys +import threading +from typing import Dict, Optional import numpy is_cpu_amd_intel = False # DEPRECATION WARNING: WILL BE REMOVED IN FUTURE RELEASE -from numexpr import interpreter, expressions, use_vml +from numexpr import expressions, interpreter, use_vml from numexpr.utils import CacheDict, ContextDict # Declare a double type that does not exist in Python space @@ -28,7 +29,7 @@ int_ = numpy.int32 long_ = numpy.int64 -typecode_to_kind = {'b': 'bool', 'i': 'int', 'l': 'long', 'f': 'float', 'd': 'double', +typecode_to_kind = {'b': 'bool', 'i': 'int', 'l': 'long', 'f': 'float', 'd': 'double', 'c': 'complex', 'n': 'none', 's': 'str'} kind_to_typecode = {'bool': 'b', 'int': 'i', 'long': 'l', 'float': 'f', 'double': 'd', 'complex': 'c', 'bytes': 's', 'str': 's', 'none': 'n'} @@ -61,6 +62,7 @@ "log", "log1p", "log10", + "log2", "exp", "expm1", "absolute", @@ -68,7 +70,19 @@ "arctan2", "fmod", "ceil", - "floor" + "floor", + "isnan", + "isfinite", + "isinf", + "hypot", + "round", + "trunc", + "nextafter", + "copysign", + "signbit", + "sign", + "minimum", + "maximum", ] @@ -104,11 +118,11 @@ def __eq__(self, other): if getattr(self, name) != getattr(other, name): return False return True - + def __lt__(self,other): - # RAM: this is a fix for issue #88 whereby sorting on constants + # RAM: this is a fix for issue #88 whereby sorting on constants # that may be of astKind == 'complex' but type(self.value) == int or float - # Here we let NumPy sort as it will cast data properly for comparison + # Here we let NumPy sort as it will cast data properly for comparison # when the Python built-ins will raise an error. 
if self.astType == 'constant': if self.astKind == other.astKind: @@ -265,13 +279,13 @@ def __str__(self): _flow_pat = r'[\;\[\:]' _dunder_pat = r'(^|[^\w])__[\w]+__($|[^\w])' -_attr_pat = r'\.\b(?!(real|imag|(\d*[eE]?[+-]?\d+)|\d*j)\b)' +_attr_pat = r'\.\b(?!(real|imag|(\d*[eE]?[+-]?\d+)|(\d*[eE]?[+-]?\d+j)|(\d*j))\b)' _blacklist_re = re.compile(f'{_flow_pat}|{_dunder_pat}|{_attr_pat}') def stringToExpression(s, types, context, sanitize: bool=True): """Given a string, convert it to a tree of ExpressionNode's. """ - # sanitize the string for obvious attack vectors that NumExpr cannot + # sanitize the string for obvious attack vectors that NumExpr cannot # parse into its homebrew AST. This is to protect the call to `eval` below. # We forbid `;`, `:`. `[` and `__`, and attribute access via '.'. # We cannot ban `.real` or `.imag` however... @@ -281,7 +295,7 @@ def stringToExpression(s, types, context, sanitize: bool=True): skip_quotes = re.sub(r'(\'[^\']*\')', '', no_whitespace) if _blacklist_re.search(skip_quotes) is not None: raise ValueError(f'Expression {s} has forbidden control characters.') - + old_ctx = expressions._context.get_current_context() try: expressions._context.set_new_context(context) @@ -307,7 +321,7 @@ def stringToExpression(s, types, context, sanitize: bool=True): # now build the expression ex = eval(c, names) - + if expressions.isConstant(ex): ex = expressions.ConstantNode(ex, expressions.getKind(ex)) elif not isinstance(ex, expressions.ExpressionNode): @@ -363,7 +377,7 @@ def getConstants(ast): a = 1 + 3j; b = 5.0 ne.evaluate('a*2 + 15j - b') """ - constant_registers = set([node.reg for node in ast.allOf("constant")]) + constant_registers = set([node.reg for node in ast.allOf("constant")]) constants_order = sorted([r.node for r in constant_registers]) constants = [convertConstantToKind(a.value, a.astKind) for a in constants_order] @@ -519,7 +533,7 @@ def nToChr(reg): return bytes([reg.n]) def quadrupleToString(opcode, store, a1=None, a2=None): - 
cop = chr(interpreter.opcodes[opcode]).encode('ascii') + cop = chr(interpreter.opcodes[opcode]).encode('latin_1') cs = nToChr(store) ca1 = nToChr(a1) ca2 = nToChr(a2) @@ -557,7 +571,7 @@ def getContext(kwargs, _frame_depth=1): context[name] = value else: raise ValueError("'%s' must be one of %s" % (name, allowed)) - + if d: raise ValueError("Unknown keyword argument '%s'" % d.popitem()[0]) if context['truediv'] == 'auto': @@ -657,7 +671,7 @@ def disassemble(nex): def parseOp(op): name, sig = [*op.rsplit(b'_', 1), ''][:2] - return name, sig + return name, sig def getArg(pc, offset): arg = nex.program[pc + (offset if offset < 4 else offset+1)] @@ -752,7 +766,7 @@ def getArguments(names, local_dict=None, global_dict=None, _frame_depth: int=2): if global_dict is None: global_dict = frame_globals - # If `call_frame` is the top frame of the interpreter we can't clear its + # If `call_frame` is the top frame of the interpreter we can't clear its # `local_dict`, because it is actually the `global_dict`. clear_local_dict = clear_local_dict and not frame_globals is local_dict @@ -774,23 +788,24 @@ def getArguments(names, local_dict=None, global_dict=None, _frame_depth: int=2): # Dictionaries for caching variable names and compiled expressions -_names_cache = CacheDict(256) -_numexpr_cache = CacheDict(256) -_numexpr_last = ContextDict() +_names_cache = threading.local() +_numexpr_cache = threading.local() +_numexpr_last = threading.local() evaluate_lock = threading.Lock() -def validate(ex: str, - local_dict: Optional[Dict] = None, + +def validate(ex: str, + local_dict: Optional[Dict] = None, global_dict: Optional[Dict] = None, - out: numpy.ndarray = None, - order: str = 'K', - casting: str = 'safe', + out: numpy.ndarray = None, + order: str = 'K', + casting: str = 'safe', _frame_depth: int = 2, sanitize: Optional[bool] = None, **kwargs) -> Optional[Exception]: r""" Validate a NumExpr expression with the given `local_dict` or `locals()`. 
- Returns `None` on success and the Exception object if one occurs. Note that + Returns `None` on success and the Exception object if one occurs. Note that you can proceed directly to call `re_evaluate()` if you use `validate()` to sanitize your expressions and variables in advance. @@ -835,30 +850,37 @@ def validate(ex: str, * 'unsafe' means any data conversions may be done. sanitize: Optional[bool] - Both `validate` and by extension `evaluate` call `eval(ex)`, which is - potentially dangerous on unsanitized inputs. As such, NumExpr by default - performs simple sanitization, banning the character ':;[', the + Both `validate` and by extension `evaluate` call `eval(ex)`, which is + potentially dangerous on unsanitized inputs. As such, NumExpr by default + performs simple sanitization, banning the character ':;[', the dunder '__[\w+]__', and attribute access to all but '.real' and '.imag'. - - Using `None` defaults to `True` unless the environment variable - `NUMEXPR_SANITIZE=0` is set, in which case the default is `False`. + + Using `None` defaults to `True` unless the environment variable + `NUMEXPR_SANITIZE=0` is set, in which case the default is `False`. Nominally this can be set via `os.environ` before `import numexpr`. _frame_depth: int - The calling frame depth. Unless you are a NumExpr developer you should + The calling frame depth. Unless you are a NumExpr developer you should not set this value. 
Note ---- - + """ - global _numexpr_last + if not hasattr(_numexpr_last, 'l'): + _numexpr_last.l = ContextDict() + + if not hasattr(_names_cache, 'c'): + _names_cache.c = CacheDict(256) + + if not hasattr(_numexpr_cache, 'c'): + _numexpr_cache.c = CacheDict(256) try: - + if not isinstance(ex, str): raise ValueError("must specify expression as a string") - + if sanitize is None: if 'NUMEXPR_SANITIZE' in os.environ: sanitize = bool(int(os.environ['NUMEXPR_SANITIZE'])) @@ -868,9 +890,9 @@ def validate(ex: str, # Get the names for this expression context = getContext(kwargs) expr_key = (ex, tuple(sorted(context.items()))) - if expr_key not in _names_cache: - _names_cache[expr_key] = getExprNames(ex, context, sanitize=sanitize) - names, ex_uses_vml = _names_cache[expr_key] + if expr_key not in _names_cache.c: + _names_cache.c[expr_key] = getExprNames(ex, context, sanitize=sanitize) + names, ex_uses_vml = _names_cache.c[expr_key] arguments = getArguments(names, local_dict, global_dict, _frame_depth=_frame_depth) # Create a signature @@ -880,24 +902,25 @@ def validate(ex: str, # Look up numexpr if possible. 
numexpr_key = expr_key + (tuple(signature),) try: - compiled_ex = _numexpr_cache[numexpr_key] + compiled_ex = _numexpr_cache.c[numexpr_key] except KeyError: - compiled_ex = _numexpr_cache[numexpr_key] = NumExpr(ex, signature, sanitize=sanitize, **context) + compiled_ex = _numexpr_cache.c[numexpr_key] = NumExpr(ex, signature, sanitize=sanitize, **context) kwargs = {'out': out, 'order': order, 'casting': casting, - 'ex_uses_vml': ex_uses_vml} - _numexpr_last.set(ex=compiled_ex, argnames=names, kwargs=kwargs) + 'ex_uses_vml': ex_uses_vml} + _numexpr_last.l.set(ex=compiled_ex, argnames=names, kwargs=kwargs) except Exception as e: return e return None -def evaluate(ex: str, - local_dict: Optional[Dict] = None, +def evaluate(ex: str, + local_dict: Optional[Dict] = None, global_dict: Optional[Dict] = None, - out: numpy.ndarray = None, - order: str = 'K', - casting: str = 'safe', + out: numpy.ndarray = None, + order: str = 'K', + casting: str = 'same_kind', sanitize: Optional[bool] = None, _frame_depth: int = 3, + disable_cache: bool = False, **kwargs) -> numpy.ndarray: r""" Evaluate a simple array expression element-wise using the virtual machine. @@ -943,38 +966,63 @@ def evaluate(ex: str, * 'unsafe' means any data conversions may be done. sanitize: bool - Both `validate` and by extension `evaluate` call `eval(ex)`, which is - potentially dangerous on unsanitized inputs. As such, NumExpr by default - performs simple sanitization, banning the character ':;[', the + `validate` (and by extension `evaluate`) call `eval(ex)`, which is + potentially dangerous on non-sanitized inputs. As such, NumExpr by default + performs simple sanitization, banning the characters ':;[', the dunder '__[\w+]__', and attribute access to all but '.real' and '.imag'. - Using `None` defaults to `True` unless the environment variable - `NUMEXPR_SANITIZE=0` is set, in which case the default is `False`. 
+ Using `None` defaults to `True` unless the environment variable + `NUMEXPR_SANITIZE=0` is set, in which case the default is `False`. Nominally this can be set via `os.environ` before `import numexpr`. _frame_depth: int - The calling frame depth. Unless you are a NumExpr developer you should + The calling frame depth. Unless you are a NumExpr developer you should not set this value. - Note - ---- - Both `validate` and by extension `evaluate` call `eval(ex)`, which is - potentially dangerous on unsanitized inputs. As such, NumExpr does some - sanitization, banning the character ':;[', the dunder '__', and attribute - access to all but '.r' for real and '.i' for imag access to complex numbers. + disable_cache: bool + If set to be `True`, disables the uses of internal expression cache. + + By default, NumExpr caches compiled expressions and associated metadata + (via the internal `_numexpr_last`, `_numexpr_cache`, and `_names_cache` + structures). This allows repeated evaluations of the same expression + to skip recompilation, improving performance in workloads where the same + expression is executed multiple times. + + However, caching retains references to input and output arrays in order + to support re-evaluation. As a result, this can increase their reference + counts and may prevent them from being garbage-collected immediately. + In situations where precise control over object lifetimes or memory + management is required, set `disable_cache=True` to avoid this behavior. + + Default is `False`. 
+ """ - # We could avoid code duplication if we called validate and then re_evaluate - # here, but they we have difficulties with the `sys.getframe(2)` call in + # We could avoid code duplication if we called validate and then re_evaluate + # here, but we have difficulties with the `sys.getframe(2)` call in # `getArguments` - e = validate(ex, local_dict=local_dict, global_dict=global_dict, - out=out, order=order, casting=casting, + + # If dissable_cache set to be True, we evaluate the expression here + # Otherwise we validate and then re_evaluate + if disable_cache: + context = getContext(kwargs) + names, ex_uses_vml = getExprNames(ex, context, sanitize=sanitize) + arguments = getArguments(names, local_dict, global_dict, _frame_depth=_frame_depth - 1) + signature = [(name, getType(arg)) for (name, arg) in + zip(names, arguments)] + compiled_ex = NumExpr(ex, signature, sanitize=sanitize, **context) + kwargs = {'out': out, 'order': order, 'casting': casting, + 'ex_uses_vml': ex_uses_vml} + return compiled_ex(*arguments, **kwargs) + + e = validate(ex, local_dict=local_dict, global_dict=global_dict, + out=out, order=order, casting=casting, _frame_depth=_frame_depth, sanitize=sanitize, **kwargs) if e is None: return re_evaluate(local_dict=local_dict, global_dict=global_dict, _frame_depth=_frame_depth) else: raise e - -def re_evaluate(local_dict: Optional[Dict] = None, + +def re_evaluate(local_dict: Optional[Dict] = None, global_dict: Optional[Dict] = None, _frame_depth: int=2) -> numpy.ndarray: """ @@ -989,17 +1037,18 @@ def re_evaluate(local_dict: Optional[Dict] = None, local_dict: dictionary, optional A dictionary that replaces the local operands in current frame. _frame_depth: int - The calling frame depth. Unless you are a NumExpr developer you should + The calling frame depth. Unless you are a NumExpr developer you should not set this value. 
""" - global _numexpr_last + if not hasattr(_numexpr_last, 'l'): + _numexpr_last.l = ContextDict() try: - compiled_ex = _numexpr_last['ex'] + compiled_ex = _numexpr_last.l['ex'] except KeyError: raise RuntimeError("A previous evaluate() execution was not found, please call `validate` or `evaluate` once before `re_evaluate`") - argnames = _numexpr_last['argnames'] + argnames = _numexpr_last.l['argnames'] args = getArguments(argnames, local_dict, global_dict, _frame_depth=_frame_depth) - kwargs = _numexpr_last['kwargs'] - with evaluate_lock: - return compiled_ex(*args, **kwargs) + kwargs = _numexpr_last.l['kwargs'] + # with evaluate_lock: + return compiled_ex(*args, **kwargs) diff --git a/numexpr/numexpr_config.hpp b/numexpr/numexpr_config.hpp index 0663c6d..5df0c35 100644 --- a/numexpr/numexpr_config.hpp +++ b/numexpr/numexpr_config.hpp @@ -19,7 +19,7 @@ #define BLOCK_SIZE1 1024 #endif -// The default threadpool size. It's prefer that the user set this via an +// The default threadpool size. 
It's prefer that the user set this via an // environment variable, "NUMEXPR_MAX_THREADS" #define DEFAULT_MAX_THREADS 64 @@ -40,12 +40,32 @@ #include "mkl_vml.h" #include "mkl_service.h" #endif +#include <cmath> +//no single precision version of signbit in C++ standard +inline bool signbitf(float x) { return signbit((double)x); } #ifdef _WIN32 #ifndef __MINGW32__ #include "missing_posix_functions.hpp" #endif #include "msvc_function_stubs.hpp" +#else +/* GCC/Clang version: use std:: (can't use it for windows) + msvc_function_stubs contains windows alternatives */ +/* Due to casting problems (normally return ints not bools, easiest to define + non-overloaded wrappers for these functions) */ +inline bool isfinitef_(float x) { return !!std::isfinite(x); } +inline bool isnanf_(float x) { return !!std::isnan(x); } +inline bool isfinited(double x) { return !!std::isfinite(x); } +inline bool isnand(double x) { return !!std::isnan(x); } +inline bool isinff_(float x) { return !!std::isinf(x); } +inline bool isinfd(double x) { return !!std::isinf(x); } + +// To handle overloading of fmax/fmin in cmath and match NumPy behaviour for NaNs +inline double fmaxd(double x, double y) { return (isnand(x) | isnand(y))? NAN : fmax(x, y); } +inline double fmind(double x, double y) { return (isnand(x) | isnand(y))? NAN : fmin(x, y); } +inline float fmaxf_(float x, float y) { return (isnanf_(x) | isnanf_(y))? NAN : fmaxf(x, y); } +inline float fminf_(float x, float y) { return (isnanf_(x) | isnanf_(y))?
NAN : fminf(x, y); } #endif #endif // NUMEXPR_CONFIG_HPP diff --git a/numexpr/numexpr_object.cpp b/numexpr/numexpr_object.cpp index e788d1c..b20aef0 100644 --- a/numexpr/numexpr_object.cpp +++ b/numexpr/numexpr_object.cpp @@ -405,4 +405,3 @@ PyTypeObject NumExprType = { 0, /* tp_alloc */ NumExpr_new, /* tp_new */ }; - diff --git a/numexpr/opcodes.hpp b/numexpr/opcodes.hpp index 086c98e..5b1c46f 100644 --- a/numexpr/opcodes.hpp +++ b/numexpr/opcodes.hpp @@ -15,6 +15,9 @@ OPCODE(n, enum_name, exported, return_type, arg1_type, arg2_type, arg3_type) Types are Tb, Ti, Tl, Tf, Td, Tc, Ts, Tn, and T0; these symbols should be #defined to whatever is needed. (T0 is the no-such-arg type.) +When adding new OPCODES, one has to respect the order of the numeration, as +there are parts of the code (iterations) which assume that the OPCODES are ordered. + */ OPCODE(0, OP_NOOP, "noop", T0, T0, T0, T0) @@ -23,157 +26,189 @@ OPCODE(1, OP_COPY_BB, "copy_bb", Tb, Tb, T0, T0) OPCODE(2, OP_INVERT_BB, "invert_bb", Tb, Tb, T0, T0) OPCODE(3, OP_AND_BBB, "and_bbb", Tb, Tb, Tb, T0) OPCODE(4, OP_OR_BBB, "or_bbb", Tb, Tb, Tb, T0) - -OPCODE(5, OP_EQ_BBB, "eq_bbb", Tb, Tb, Tb, T0) -OPCODE(6, OP_NE_BBB, "ne_bbb", Tb, Tb, Tb, T0) - -OPCODE(7, OP_GT_BII, "gt_bii", Tb, Ti, Ti, T0) -OPCODE(8, OP_GE_BII, "ge_bii", Tb, Ti, Ti, T0) -OPCODE(9, OP_EQ_BII, "eq_bii", Tb, Ti, Ti, T0) -OPCODE(10, OP_NE_BII, "ne_bii", Tb, Ti, Ti, T0) - -OPCODE(11, OP_GT_BLL, "gt_bll", Tb, Tl, Tl, T0) -OPCODE(12, OP_GE_BLL, "ge_bll", Tb, Tl, Tl, T0) -OPCODE(13, OP_EQ_BLL, "eq_bll", Tb, Tl, Tl, T0) -OPCODE(14, OP_NE_BLL, "ne_bll", Tb, Tl, Tl, T0) - -OPCODE(15, OP_GT_BFF, "gt_bff", Tb, Tf, Tf, T0) -OPCODE(16, OP_GE_BFF, "ge_bff", Tb, Tf, Tf, T0) -OPCODE(17, OP_EQ_BFF, "eq_bff", Tb, Tf, Tf, T0) -OPCODE(18, OP_NE_BFF, "ne_bff", Tb, Tf, Tf, T0) - -OPCODE(19, OP_GT_BDD, "gt_bdd", Tb, Td, Td, T0) -OPCODE(20, OP_GE_BDD, "ge_bdd", Tb, Td, Td, T0) -OPCODE(21, OP_EQ_BDD, "eq_bdd", Tb, Td, Td, T0) -OPCODE(22, OP_NE_BDD, "ne_bdd", Tb, Td, 
Td, T0) - -OPCODE(23, OP_GT_BSS, "gt_bss", Tb, Ts, Ts, T0) -OPCODE(24, OP_GE_BSS, "ge_bss", Tb, Ts, Ts, T0) -OPCODE(25, OP_EQ_BSS, "eq_bss", Tb, Ts, Ts, T0) -OPCODE(26, OP_NE_BSS, "ne_bss", Tb, Ts, Ts, T0) - -OPCODE(27, OP_CAST_IB, "cast_ib", Ti, Tb, T0, T0) -OPCODE(28, OP_COPY_II, "copy_ii", Ti, Ti, T0, T0) -OPCODE(29, OP_ONES_LIKE_II, "ones_like_ii", Ti, T0, T0, T0) -OPCODE(30, OP_NEG_II, "neg_ii", Ti, Ti, T0, T0) -OPCODE(31, OP_ADD_III, "add_iii", Ti, Ti, Ti, T0) -OPCODE(32, OP_SUB_III, "sub_iii", Ti, Ti, Ti, T0) -OPCODE(33, OP_MUL_III, "mul_iii", Ti, Ti, Ti, T0) -OPCODE(34, OP_DIV_III, "div_iii", Ti, Ti, Ti, T0) -OPCODE(35, OP_POW_III, "pow_iii", Ti, Ti, Ti, T0) -OPCODE(36, OP_MOD_III, "mod_iii", Ti, Ti, Ti, T0) - -OPCODE(37, OP_LSHIFT_III, "lshift_iii", Ti, Ti, Ti, T0) -OPCODE(38, OP_RSHIFT_III, "rshift_iii", Ti, Ti, Ti, T0) - -OPCODE(39, OP_WHERE_IBII, "where_ibii", Ti, Tb, Ti, Ti) - -OPCODE(40, OP_CAST_LI, "cast_li", Tl, Ti, T0, T0) -OPCODE(41, OP_COPY_LL, "copy_ll", Tl, Tl, T0, T0) -OPCODE(42, OP_ONES_LIKE_LL, "ones_like_ll", Tl, T0, T0, T0) -OPCODE(43, OP_NEG_LL, "neg_ll", Tl, Tl, T0, T0) -OPCODE(44, OP_ADD_LLL, "add_lll", Tl, Tl, Tl, T0) -OPCODE(45, OP_SUB_LLL, "sub_lll", Tl, Tl, Tl, T0) -OPCODE(46, OP_MUL_LLL, "mul_lll", Tl, Tl, Tl, T0) -OPCODE(47, OP_DIV_LLL, "div_lll", Tl, Tl, Tl, T0) -OPCODE(48, OP_POW_LLL, "pow_lll", Tl, Tl, Tl, T0) -OPCODE(49, OP_MOD_LLL, "mod_lll", Tl, Tl, Tl, T0) - -OPCODE(50, OP_LSHIFT_LLL, "lshift_lll", Tl, Tl, Tl, T0) -OPCODE(51, OP_RSHIFT_LLL, "rshift_lll", Tl, Tl, Tl, T0) - -OPCODE(52, OP_WHERE_LBLL, "where_lbll", Tl, Tb, Tl, Tl) - -OPCODE(53, OP_CAST_FI, "cast_fi", Tf, Ti, T0, T0) -OPCODE(54, OP_CAST_FL, "cast_fl", Tf, Tl, T0, T0) -OPCODE(55, OP_COPY_FF, "copy_ff", Tf, Tf, T0, T0) -OPCODE(56, OP_ONES_LIKE_FF, "ones_like_ff", Tf, T0, T0, T0) -OPCODE(57, OP_NEG_FF, "neg_ff", Tf, Tf, T0, T0) -OPCODE(58, OP_ADD_FFF, "add_fff", Tf, Tf, Tf, T0) -OPCODE(59, OP_SUB_FFF, "sub_fff", Tf, Tf, Tf, T0) -OPCODE(60, OP_MUL_FFF, "mul_fff", 
Tf, Tf, Tf, T0) -OPCODE(61, OP_DIV_FFF, "div_fff", Tf, Tf, Tf, T0) -OPCODE(62, OP_POW_FFF, "pow_fff", Tf, Tf, Tf, T0) -OPCODE(63, OP_MOD_FFF, "mod_fff", Tf, Tf, Tf, T0) -OPCODE(64, OP_SQRT_FF, "sqrt_ff", Tf, Tf, T0, T0) -OPCODE(65, OP_WHERE_FBFF, "where_fbff", Tf, Tb, Tf, Tf) -OPCODE(66, OP_FUNC_FFN, "func_ffn", Tf, Tf, Tn, T0) -OPCODE(67, OP_FUNC_FFFN, "func_fffn", Tf, Tf, Tf, Tn) - -OPCODE(68, OP_CAST_DI, "cast_di", Td, Ti, T0, T0) -OPCODE(69, OP_CAST_DL, "cast_dl", Td, Tl, T0, T0) -OPCODE(70, OP_CAST_DF, "cast_df", Td, Tf, T0, T0) -OPCODE(71, OP_COPY_DD, "copy_dd", Td, Td, T0, T0) -OPCODE(72, OP_ONES_LIKE_DD, "ones_like_dd", Td, T0, T0, T0) -OPCODE(73, OP_NEG_DD, "neg_dd", Td, Td, T0, T0) -OPCODE(74, OP_ADD_DDD, "add_ddd", Td, Td, Td, T0) -OPCODE(75, OP_SUB_DDD, "sub_ddd", Td, Td, Td, T0) -OPCODE(76, OP_MUL_DDD, "mul_ddd", Td, Td, Td, T0) -OPCODE(77, OP_DIV_DDD, "div_ddd", Td, Td, Td, T0) -OPCODE(78, OP_POW_DDD, "pow_ddd", Td, Td, Td, T0) -OPCODE(79, OP_MOD_DDD, "mod_ddd", Td, Td, Td, T0) -OPCODE(80, OP_SQRT_DD, "sqrt_dd", Td, Td, T0, T0) -OPCODE(81, OP_WHERE_DBDD, "where_dbdd", Td, Tb, Td, Td) -OPCODE(82, OP_FUNC_DDN, "func_ddn", Td, Td, Tn, T0) -OPCODE(83, OP_FUNC_DDDN, "func_dddn", Td, Td, Td, Tn) - -OPCODE(84, OP_EQ_BCC, "eq_bcc", Tb, Tc, Tc, T0) -OPCODE(85, OP_NE_BCC, "ne_bcc", Tb, Tc, Tc, T0) - -OPCODE(86, OP_CAST_CI, "cast_ci", Tc, Ti, T0, T0) -OPCODE(87, OP_CAST_CL, "cast_cl", Tc, Tl, T0, T0) -OPCODE(88, OP_CAST_CF, "cast_cf", Tc, Tf, T0, T0) -OPCODE(89, OP_CAST_CD, "cast_cd", Tc, Td, T0, T0) -OPCODE(90, OP_ONES_LIKE_CC, "ones_like_cc", Tc, T0, T0, T0) -OPCODE(91, OP_COPY_CC, "copy_cc", Tc, Tc, T0, T0) -OPCODE(92, OP_NEG_CC, "neg_cc", Tc, Tc, T0, T0) -OPCODE(93, OP_ADD_CCC, "add_ccc", Tc, Tc, Tc, T0) -OPCODE(94, OP_SUB_CCC, "sub_ccc", Tc, Tc, Tc, T0) -OPCODE(95, OP_MUL_CCC, "mul_ccc", Tc, Tc, Tc, T0) -OPCODE(96, OP_DIV_CCC, "div_ccc", Tc, Tc, Tc, T0) -OPCODE(97, OP_WHERE_CBCC, "where_cbcc", Tc, Tb, Tc, Tc) -OPCODE(98, OP_FUNC_CCN, "func_ccn", Tc, Tc, Tn, 
T0) -OPCODE(99, OP_FUNC_CCCN, "func_cccn", Tc, Tc, Tc, Tn) - -OPCODE(100, OP_REAL_DC, "real_dc", Td, Tc, T0, T0) -OPCODE(101, OP_IMAG_DC, "imag_dc", Td, Tc, T0, T0) -OPCODE(102, OP_COMPLEX_CDD, "complex_cdd", Tc, Td, Td, T0) - -OPCODE(103, OP_COPY_SS, "copy_ss", Ts, Ts, T0, T0) - -OPCODE(104, OP_WHERE_BBBB, "where_bbbb", Tb, Tb, Tb, Tb) - -OPCODE(105, OP_CONTAINS_BSS, "contains_bss", Tb, Ts, Ts, T0) - -OPCODE(106, OP_REDUCTION, NULL, T0, T0, T0, T0) +OPCODE(5, OP_XOR_BBB, "xor_bbb", Tb, Tb, Tb, T0) + +OPCODE(6, OP_EQ_BBB, "eq_bbb", Tb, Tb, Tb, T0) +OPCODE(7, OP_NE_BBB, "ne_bbb", Tb, Tb, Tb, T0) + +OPCODE(8, OP_GT_BII, "gt_bii", Tb, Ti, Ti, T0) +OPCODE(9, OP_GE_BII, "ge_bii", Tb, Ti, Ti, T0) +OPCODE(10, OP_EQ_BII, "eq_bii", Tb, Ti, Ti, T0) +OPCODE(11, OP_NE_BII, "ne_bii", Tb, Ti, Ti, T0) + +OPCODE(12, OP_GT_BLL, "gt_bll", Tb, Tl, Tl, T0) +OPCODE(13, OP_GE_BLL, "ge_bll", Tb, Tl, Tl, T0) +OPCODE(14, OP_EQ_BLL, "eq_bll", Tb, Tl, Tl, T0) +OPCODE(15, OP_NE_BLL, "ne_bll", Tb, Tl, Tl, T0) + +OPCODE(16, OP_GT_BFF, "gt_bff", Tb, Tf, Tf, T0) +OPCODE(17, OP_GE_BFF, "ge_bff", Tb, Tf, Tf, T0) +OPCODE(18, OP_EQ_BFF, "eq_bff", Tb, Tf, Tf, T0) +OPCODE(19, OP_NE_BFF, "ne_bff", Tb, Tf, Tf, T0) + +OPCODE(20, OP_GT_BDD, "gt_bdd", Tb, Td, Td, T0) +OPCODE(21, OP_GE_BDD, "ge_bdd", Tb, Td, Td, T0) +OPCODE(22, OP_EQ_BDD, "eq_bdd", Tb, Td, Td, T0) +OPCODE(23, OP_NE_BDD, "ne_bdd", Tb, Td, Td, T0) + +OPCODE(24, OP_GT_BSS, "gt_bss", Tb, Ts, Ts, T0) +OPCODE(25, OP_GE_BSS, "ge_bss", Tb, Ts, Ts, T0) +OPCODE(26, OP_EQ_BSS, "eq_bss", Tb, Ts, Ts, T0) +OPCODE(27, OP_NE_BSS, "ne_bss", Tb, Ts, Ts, T0) + +OPCODE(28, OP_CAST_IB, "cast_ib", Ti, Tb, T0, T0) +OPCODE(29, OP_COPY_II, "copy_ii", Ti, Ti, T0, T0) +OPCODE(30, OP_ONES_LIKE_II, "ones_like_ii", Ti, T0, T0, T0) +OPCODE(31, OP_NEG_II, "neg_ii", Ti, Ti, T0, T0) +OPCODE(32, OP_ADD_III, "add_iii", Ti, Ti, Ti, T0) +OPCODE(33, OP_SUB_III, "sub_iii", Ti, Ti, Ti, T0) +OPCODE(34, OP_MUL_III, "mul_iii", Ti, Ti, Ti, T0) +OPCODE(35, OP_DIV_III, "div_iii", Ti, Ti, 
Ti, T0) +OPCODE(36, OP_POW_III, "pow_iii", Ti, Ti, Ti, T0) +OPCODE(37, OP_MOD_III, "mod_iii", Ti, Ti, Ti, T0) +OPCODE(38, OP_FLOORDIV_III, "floordiv_iii", Ti, Ti, Ti, T0) + + +OPCODE(39, OP_LSHIFT_III, "lshift_iii", Ti, Ti, Ti, T0) +OPCODE(40, OP_RSHIFT_III, "rshift_iii", Ti, Ti, Ti, T0) + +OPCODE(41, OP_WHERE_IBII, "where_ibii", Ti, Tb, Ti, Ti) +// Bitwise ops +OPCODE(42, OP_INVERT_II, "invert_ii", Ti, Ti, T0, T0) +OPCODE(43, OP_AND_III, "and_iii", Ti, Ti, Ti, T0) +OPCODE(44, OP_OR_III, "or_iii", Ti, Ti, Ti, T0) +OPCODE(45, OP_XOR_III, "xor_iii", Ti, Ti, Ti, T0) + +OPCODE(46, OP_CAST_LI, "cast_li", Tl, Ti, T0, T0) +OPCODE(47, OP_COPY_LL, "copy_ll", Tl, Tl, T0, T0) +OPCODE(48, OP_ONES_LIKE_LL, "ones_like_ll", Tl, T0, T0, T0) +OPCODE(49, OP_NEG_LL, "neg_ll", Tl, Tl, T0, T0) +OPCODE(50, OP_ADD_LLL, "add_lll", Tl, Tl, Tl, T0) +OPCODE(51, OP_SUB_LLL, "sub_lll", Tl, Tl, Tl, T0) +OPCODE(52, OP_MUL_LLL, "mul_lll", Tl, Tl, Tl, T0) +OPCODE(53, OP_DIV_LLL, "div_lll", Tl, Tl, Tl, T0) +OPCODE(54, OP_POW_LLL, "pow_lll", Tl, Tl, Tl, T0) +OPCODE(55, OP_MOD_LLL, "mod_lll", Tl, Tl, Tl, T0) +OPCODE(56, OP_FLOORDIV_LLL, "floordiv_lll", Tl, Tl, Tl, T0) + +OPCODE(57, OP_LSHIFT_LLL, "lshift_lll", Tl, Tl, Tl, T0) +OPCODE(58, OP_RSHIFT_LLL, "rshift_lll", Tl, Tl, Tl, T0) + +OPCODE(59, OP_WHERE_LBLL, "where_lbll", Tl, Tb, Tl, Tl) +// Bitwise ops +OPCODE(60, OP_INVERT_LL, "invert_ll", Tl, Tl, T0, T0) +OPCODE(61, OP_AND_LLL, "and_lll", Tl, Tl, Tl, T0) +OPCODE(62, OP_OR_LLL, "or_lll", Tl, Tl, Tl, T0) +OPCODE(63, OP_XOR_LLL, "xor_lll", Tl, Tl, Tl, T0) + +OPCODE(64, OP_CAST_FI, "cast_fi", Tf, Ti, T0, T0) +OPCODE(65, OP_CAST_FL, "cast_fl", Tf, Tl, T0, T0) +OPCODE(66, OP_COPY_FF, "copy_ff", Tf, Tf, T0, T0) +OPCODE(67, OP_ONES_LIKE_FF, "ones_like_ff", Tf, T0, T0, T0) +OPCODE(68, OP_NEG_FF, "neg_ff", Tf, Tf, T0, T0) +OPCODE(69, OP_ADD_FFF, "add_fff", Tf, Tf, Tf, T0) +OPCODE(70, OP_SUB_FFF, "sub_fff", Tf, Tf, Tf, T0) +OPCODE(71, OP_MUL_FFF, "mul_fff", Tf, Tf, Tf, T0) +OPCODE(72, OP_DIV_FFF, 
"div_fff", Tf, Tf, Tf, T0) +OPCODE(73, OP_POW_FFF, "pow_fff", Tf, Tf, Tf, T0) +OPCODE(74, OP_MOD_FFF, "mod_fff", Tf, Tf, Tf, T0) +OPCODE(75, OP_FLOORDIV_FFF, "floordiv_fff", Tf, Tf, Tf, T0) +OPCODE(76, OP_SQRT_FF, "sqrt_ff", Tf, Tf, T0, T0) +OPCODE(77, OP_WHERE_FBFF, "where_fbff", Tf, Tb, Tf, Tf) + +OPCODE(78, OP_FUNC_FFN, "func_ffn", Tf, Tf, Tn, T0) +OPCODE(79, OP_FUNC_FFFN, "func_fffn", Tf, Tf, Tf, Tn) + +OPCODE(80, OP_CAST_DI, "cast_di", Td, Ti, T0, T0) +OPCODE(81, OP_CAST_DL, "cast_dl", Td, Tl, T0, T0) +OPCODE(82, OP_CAST_DF, "cast_df", Td, Tf, T0, T0) +OPCODE(83, OP_COPY_DD, "copy_dd", Td, Td, T0, T0) +OPCODE(84, OP_ONES_LIKE_DD, "ones_like_dd", Td, T0, T0, T0) +OPCODE(85, OP_NEG_DD, "neg_dd", Td, Td, T0, T0) +OPCODE(86, OP_ADD_DDD, "add_ddd", Td, Td, Td, T0) +OPCODE(87, OP_SUB_DDD, "sub_ddd", Td, Td, Td, T0) +OPCODE(88, OP_MUL_DDD, "mul_ddd", Td, Td, Td, T0) +OPCODE(89, OP_DIV_DDD, "div_ddd", Td, Td, Td, T0) +OPCODE(90, OP_POW_DDD, "pow_ddd", Td, Td, Td, T0) +OPCODE(91, OP_MOD_DDD, "mod_ddd", Td, Td, Td, T0) +OPCODE(92, OP_FLOORDIV_DDD, "floordiv_ddd", Td, Td, Td, T0) + +OPCODE(93, OP_SQRT_DD, "sqrt_dd", Td, Td, T0, T0) +OPCODE(94, OP_WHERE_DBDD, "where_dbdd", Td, Tb, Td, Td) +OPCODE(95, OP_FUNC_DDN, "func_ddn", Td, Td, Tn, T0) +OPCODE(96, OP_FUNC_DDDN, "func_dddn", Td, Td, Td, Tn) + +OPCODE(97, OP_EQ_BCC, "eq_bcc", Tb, Tc, Tc, T0) +OPCODE(98, OP_NE_BCC, "ne_bcc", Tb, Tc, Tc, T0) + +OPCODE(99, OP_CAST_CI, "cast_ci", Tc, Ti, T0, T0) +OPCODE(100, OP_CAST_CL, "cast_cl", Tc, Tl, T0, T0) +OPCODE(101, OP_CAST_CF, "cast_cf", Tc, Tf, T0, T0) +OPCODE(102, OP_CAST_CD, "cast_cd", Tc, Td, T0, T0) +OPCODE(103, OP_ONES_LIKE_CC, "ones_like_cc", Tc, T0, T0, T0) +OPCODE(104, OP_COPY_CC, "copy_cc", Tc, Tc, T0, T0) +OPCODE(105, OP_NEG_CC, "neg_cc", Tc, Tc, T0, T0) +OPCODE(106, OP_ADD_CCC, "add_ccc", Tc, Tc, Tc, T0) +OPCODE(107, OP_SUB_CCC, "sub_ccc", Tc, Tc, Tc, T0) +OPCODE(108, OP_MUL_CCC, "mul_ccc", Tc, Tc, Tc, T0) +OPCODE(109, OP_DIV_CCC, "div_ccc", Tc, Tc, Tc, T0) 
+OPCODE(110, OP_WHERE_CBCC, "where_cbcc", Tc, Tb, Tc, Tc) +OPCODE(111, OP_FUNC_CCN, "func_ccn", Tc, Tc, Tn, T0) +OPCODE(112, OP_FUNC_CCCN, "func_cccn", Tc, Tc, Tc, Tn) + +OPCODE(113, OP_REAL_DC, "real_dc", Td, Tc, T0, T0) +OPCODE(114, OP_IMAG_DC, "imag_dc", Td, Tc, T0, T0) +OPCODE(115, OP_COMPLEX_CDD, "complex_cdd", Tc, Td, Td, T0) + +OPCODE(116, OP_COPY_SS, "copy_ss", Ts, Ts, T0, T0) + +OPCODE(117, OP_WHERE_BBBB, "where_bbbb", Tb, Tb, Tb, Tb) + +OPCODE(118, OP_CONTAINS_BSS, "contains_bss", Tb, Ts, Ts, T0) +//Boolean outputs +OPCODE(119, OP_FUNC_BDN, "func_bdn", Tb, Td, Tn, T0) +OPCODE(120, OP_FUNC_BFN, "func_bfn", Tb, Tf, Tn, T0) +OPCODE(121, OP_FUNC_BCN, "func_bcn", Tb, Tc, Tn, T0) +//Integer funcs +OPCODE(122, OP_FUNC_IIN, "func_iin", Ti, Ti, Tn, T0) +OPCODE(123, OP_FUNC_LLN, "func_lln", Tl, Tl, Tn, T0) + +// Reductions always have to be at the end - parts of the code +// use > OP_REDUCTION to decide whether operation is a reduction +OPCODE(124, OP_REDUCTION, NULL, T0, T0, T0, T0) /* Last argument in a reduction is the axis of the array the reduction should be applied along. 
*/ -OPCODE(107, OP_SUM_IIN, "sum_iin", Ti, Ti, Tn, T0) -OPCODE(108, OP_SUM_LLN, "sum_lln", Tl, Tl, Tn, T0) -OPCODE(109, OP_SUM_FFN, "sum_ffn", Tf, Tf, Tn, T0) -OPCODE(110, OP_SUM_DDN, "sum_ddn", Td, Td, Tn, T0) -OPCODE(111, OP_SUM_CCN, "sum_ccn", Tc, Tc, Tn, T0) - -OPCODE(112, OP_PROD, NULL, T0, T0, T0, T0) -OPCODE(113, OP_PROD_IIN, "prod_iin", Ti, Ti, Tn, T0) -OPCODE(114, OP_PROD_LLN, "prod_lln", Tl, Tl, Tn, T0) -OPCODE(115, OP_PROD_FFN, "prod_ffn", Tf, Tf, Tn, T0) -OPCODE(116, OP_PROD_DDN, "prod_ddn", Td, Td, Tn, T0) -OPCODE(117, OP_PROD_CCN, "prod_ccn", Tc, Tc, Tn, T0) - -OPCODE(118, OP_MIN, NULL, T0, T0, T0, T0) -OPCODE(119, OP_MIN_IIN, "min_iin", Ti, Ti, Tn, T0) -OPCODE(120, OP_MIN_LLN, "min_lln", Tl, Tl, Tn, T0) -OPCODE(121, OP_MIN_FFN, "min_ffn", Tf, Tf, Tn, T0) -OPCODE(122, OP_MIN_DDN, "min_ddn", Td, Td, Tn, T0) - -OPCODE(123, OP_MAX, NULL, T0, T0, T0, T0) -OPCODE(124, OP_MAX_IIN, "max_iin", Ti, Ti, Tn, T0) -OPCODE(125, OP_MAX_LLN, "max_lln", Tl, Tl, Tn, T0) -OPCODE(126, OP_MAX_FFN, "max_ffn", Tf, Tf, Tn, T0) -OPCODE(127, OP_MAX_DDN, "max_ddn", Td, Td, Tn, T0) +OPCODE(125, OP_SUM_IIN, "sum_iin", Ti, Ti, Tn, T0) +OPCODE(126, OP_SUM_LLN, "sum_lln", Tl, Tl, Tn, T0) +OPCODE(127, OP_SUM_FFN, "sum_ffn", Tf, Tf, Tn, T0) +OPCODE(128, OP_SUM_DDN, "sum_ddn", Td, Td, Tn, T0) +OPCODE(129, OP_SUM_CCN, "sum_ccn", Tc, Tc, Tn, T0) + +OPCODE(130, OP_PROD, NULL, T0, T0, T0, T0) +OPCODE(131, OP_PROD_IIN, "prod_iin", Ti, Ti, Tn, T0) +OPCODE(132, OP_PROD_LLN, "prod_lln", Tl, Tl, Tn, T0) +OPCODE(133, OP_PROD_FFN, "prod_ffn", Tf, Tf, Tn, T0) +OPCODE(134, OP_PROD_DDN, "prod_ddn", Td, Td, Tn, T0) +OPCODE(135, OP_PROD_CCN, "prod_ccn", Tc, Tc, Tn, T0) + +OPCODE(136, OP_MIN, NULL, T0, T0, T0, T0) +OPCODE(137, OP_MIN_IIN, "min_iin", Ti, Ti, Tn, T0) +OPCODE(138, OP_MIN_LLN, "min_lln", Tl, Tl, Tn, T0) +OPCODE(139, OP_MIN_FFN, "min_ffn", Tf, Tf, Tn, T0) +OPCODE(140, OP_MIN_DDN, "min_ddn", Td, Td, Tn, T0) + +OPCODE(141, OP_MAX, NULL, T0, T0, T0, T0) +OPCODE(142, OP_MAX_IIN, "max_iin", Ti, 
Ti, Tn, T0) +OPCODE(143, OP_MAX_LLN, "max_lln", Tl, Tl, Tn, T0) +OPCODE(144, OP_MAX_FFN, "max_ffn", Tf, Tf, Tn, T0) +OPCODE(145, OP_MAX_DDN, "max_ddn", Td, Td, Tn, T0) +/* +When we get to 255, will maybe have to change code again +(change latin_1 encoding in necompiler.py, use something +other than unsigned char for OPCODE table) +*/ /* Should be the last opcode */ -OPCODE(128, OP_END, NULL, T0, T0, T0, T0) +OPCODE(146, OP_END, NULL, T0, T0, T0, T0) diff --git a/numexpr/tests/__init__.py b/numexpr/tests/__init__.py index 3fff411..f47c8cc 100644 --- a/numexpr/tests/__init__.py +++ b/numexpr/tests/__init__.py @@ -8,7 +8,7 @@ # rights to use. #################################################################### -from numexpr.tests.test_numexpr import test, print_versions +from numexpr.tests.test_numexpr import print_versions, test if __name__ == '__main__': test() diff --git a/numexpr/tests/conftest.py b/numexpr/tests/conftest.py new file mode 100644 index 0000000..ea8b30f --- /dev/null +++ b/numexpr/tests/conftest.py @@ -0,0 +1,21 @@ +################################################################### +# Numexpr - Fast numerical array expression evaluator for NumPy. +# +# License: MIT +# Author: See AUTHORS.txt +# +# See LICENSE.txt and LICENSES/*.txt for details about copyright and +# rights to use. 
+#################################################################### + +import pytest + +import numexpr + + +def pytest_configure(config): + config.addinivalue_line( + "markers", "thread_unsafe: mark test as unsafe for parallel execution" + ) + print("") + numexpr.print_versions() diff --git a/numexpr/tests/test_numexpr.py b/numexpr/tests/test_numexpr.py index 2aa56ca..46fad29 100644 --- a/numexpr/tests/test_numexpr.py +++ b/numexpr/tests/test_numexpr.py @@ -11,31 +11,39 @@ import os -import sys import platform +import subprocess +import sys +import unittest import warnings from contextlib import contextmanager -import subprocess +from unittest.mock import MagicMock -import numpy as np -from numpy import ( - array, arange, empty, zeros, int32, int64, uint16, cdouble, float64, rec, - copy, ones_like, where, all as alltrue, linspace, - sum, prod, sqrt, fmod, floor, ceil, - sin, cos, tan, arcsin, arccos, arctan, arctan2, - sinh, cosh, tanh, arcsinh, arccosh, arctanh, - log, log1p, log10, exp, expm1, conj) import numpy -from numpy.testing import (assert_equal, assert_array_equal, - assert_array_almost_equal, assert_allclose) -from numpy import shape, allclose, array_equal, ravel, isnan, isinf +import numpy as np +from numpy import all as alltrue +from numpy import (allclose, arange, arccos, arccosh, arcsin, arcsinh, arctan, + arctan2, arctanh, array, array_equal, cdouble, ceil, conj, + copy, copysign, cos, cosh, empty, exp, expm1, float64, + floor, fmod, hypot, int32, int64, isfinite, isinf, isnan, + linspace, log, log1p, log2, log10, maximum, minimum, + nextafter, ones_like, prod, ravel, rec, round, shape, sign, + signbit, sin, sinh, sqrt, sum, tan, tanh, trunc, uint16, + where, zeros) +from numpy.testing import (assert_allclose, assert_array_almost_equal, + assert_array_equal, assert_equal) import numexpr -from numexpr import E, NumExpr, evaluate, re_evaluate, validate, disassemble, use_vml +from numexpr import (E, NumExpr, disassemble, evaluate, re_evaluate, 
use_vml, + validate) from numexpr.expressions import ConstantNode from numexpr.utils import detect_number_of_cores -import unittest +try: + import pytest + pytest_available = True +except ImportError: + pytest_available = False TestCase = unittest.TestCase @@ -44,6 +52,15 @@ MAX_THREADS = 16 +if not pytest_available: + def identity(f): + return f + + pytest = MagicMock() + pytest.mark = MagicMock() + pytest.mark.thread_unsafe = identity + + class test_numexpr(TestCase): """Testing with 1 thread""" nthreads = 1 @@ -309,15 +326,28 @@ def test_where_scalar_bool(self): res = evaluate('where(a, b, c)') assert_array_equal(res, c) + # Comment out this test completely, as modern Python optimizes handling refcounts. + # See #511 for more info. @unittest.skipIf(hasattr(sys, "pypy_version_info"), "PyPy does not have sys.getrefcount()") - def test_refcount(self): + def _test_refcount(self): # Regression test for issue #310 a = array([1]) assert sys.getrefcount(a) == 2 evaluate('1') assert sys.getrefcount(a) == 2 + # Test if `disable_cache` works correctly with refcount, see issue #521 + # Comment out as modern Python optimizes handling refcounts. + @unittest.skipIf(hasattr(sys, "pypy_version_info"), + "PyPy does not have sys.getrefcount()") + def _test_refcount_disable_cache(self): + a = array([1]) + b = array([1]) + evaluate('a', out=b, disable_cache=True) + assert sys.getrefcount(b) == 2 + + @pytest.mark.thread_unsafe def test_locals_clears_globals(self): # Check for issue #313, whereby clearing f_locals also clear f_globals # if in the top-frame. 
This cannot be done inside `unittest` as it is always @@ -341,6 +371,7 @@ def test_locals_clears_globals(self): +@pytest.mark.thread_unsafe class test_numexpr2(test_numexpr): """Testing with 2 threads""" nthreads = 2 @@ -437,6 +468,69 @@ def test_boolean_operator(self): else: raise ValueError("should raise exception!") + x = np.ones(10, dtype='bool') + y = np.zeros(10, dtype='bool') + assert_array_equal(evaluate("x & y"), x & y) # and + assert_array_equal(evaluate("x ^ y"), x ^ y) # xor + assert_array_equal(evaluate("x | y"), x | y) # or + assert_array_equal(evaluate("~x"), ~x) # invert + + def test_bitwise_operators(self): + x = arange(10, dtype='i4') + y = arange(10, dtype='i4') + assert_array_equal(evaluate("x & y"), x & y) # and + assert_array_equal(evaluate("x ^ y"), x ^ y) # xor + assert_array_equal(evaluate("x | y"), x | y) # or + assert_array_equal(evaluate("~x"), ~x) # invert + + x = arange(10, dtype='i8') + y = arange(10, dtype='i8') + assert_array_equal(evaluate("x & y"), x & y) # and + assert_array_equal(evaluate("x ^ y"), x ^ y) # xor + assert_array_equal(evaluate("x | y"), x | y) # or + assert_array_equal(evaluate("~x"), ~x) # invert + + def test_complex_tan(self): + # old version of NumExpr had overflow problems + x = np.arange(1, 400., step=16., dtype=np.complex128) + y = 1j*np.arange(1, 400., step=16., dtype=np.complex128) + assert_array_almost_equal(evaluate("tan(x + y)"), tan(x + y)) + assert_array_almost_equal(evaluate("tanh(x + y)"), tanh(x + y)) + + def test_maximum_minimum(self): + for dtype in [float, double, int, np.int64]: + x = arange(10, dtype=dtype) + y = 2 * arange(10, dtype=dtype)[::-1] + if dtype in (float, double): + y[5] = np.nan + x[2] = np.nan + assert_array_equal(evaluate("maximum(x,y)"), maximum(x,y)) + assert_array_equal(evaluate("minimum(x,y)"), minimum(x,y)) + + def test_addmult_booleans(self): + x = np.asarray([0, 1, 0, 0, 1], dtype=bool) + y = x[::-1] + res_ne = evaluate("x * y") + res_np = x * y + 
assert_array_equal(res_ne, res_np) + assert res_ne.dtype == res_np.dtype + res_ne = evaluate("x + y") + res_np = x + y + assert_array_equal(res_ne, res_np) + assert res_ne.dtype == res_np.dtype + + def test_sign_round(self): + for dtype in [float, double, np.int32, np.int64, complex]: + x = arange(10, dtype=dtype) + y = 2 * arange(10, dtype=dtype)[::-1] + r = x-y + if not np.issubdtype(dtype, np.integer): + r[-1] = np.nan + assert evaluate("round(r)").dtype == round(r).dtype + assert evaluate("sign(r)").dtype == sign(r).dtype + assert_array_equal(evaluate("sign(r)"), sign(r)) + assert_array_equal(evaluate("round(r)"), round(r)) + def test_rational_expr(self): a = arange(1e6) b = arange(1e6) * 0.1 @@ -512,6 +606,7 @@ def test_illegal_value(self): else: self.fail() + @pytest.mark.thread_unsafe def test_sanitize(self): with _environment('NUMEXPR_SANITIZE', '1'): # Forbid dunder @@ -583,11 +678,14 @@ def test_sanitize(self): evaluate('1.5j') evaluate('3.j') + #pass imaginary with scientific notation + evaluate('1.2e3+4.5e6j') + # pass forbidden characters within quotes x = np.array(['a', 'b'], dtype=bytes) evaluate("x == 'b:'") - + @pytest.mark.thread_unsafe def test_no_sanitize(self): try: # Errors on compile() after eval() evaluate('import os;', sanitize=False) @@ -643,11 +741,15 @@ def test_negative_mod(self): n = np.array([-360, -360, -360, 360, 360, 360], dtype=np.int32) out_i = evaluate('a % n') assert_equal(out_i, np.mod(a, n)) + main_i = evaluate('a // n') + assert_equal(main_i, a // n) b = a.astype(np.int64) m = n.astype(np.int64) out_l = evaluate('b % m') assert_equal(out_l, np.mod(b, m)) + main_l = evaluate('b // m') + assert_equal(main_l, a // m) def test_negative_power_scalar(self): # Test for issue #428, where the power is negative and the base is an @@ -660,20 +762,48 @@ def test_negative_power_scalar(self): out_l = evaluate('base ** -1.0') assert_equal(out_l, np.power(base, -1.0)) - def test_ex_uses_vml(self): vml_funcs = [ "sin", "cos", "tan", 
"arcsin", "arccos", "arctan", "sinh", "cosh", "tanh", "arcsinh", "arccosh", "arctanh", - "log", "log1p","log10", "exp", "expm1", "abs", "conj", - "arctan2", "fmod"] + "log", "log1p","log10", "log2", "exp", "expm1", "abs", "conj", + "arctan2", "fmod", "hypot"] for func in vml_funcs: strexpr = func+'(a)' _, ex_uses_vml = numexpr.necompiler.getExprNames(strexpr, {}) assert_equal(ex_uses_vml, use_vml, strexpr) + def test_bool_funcs(self): + # Test functions with boolean outputs + array_size = 100 + dtype = np.float32 + a = np.arange(2 * array_size, dtype=dtype) + a[array_size//2] = np.nan + a[array_size//3] = np.inf + a[array_size//4] = -2 + + assert_equal(evaluate("isnan(a)"), isnan(a)) + assert_equal(evaluate("isfinite(a)"), isfinite(a)) + assert_equal(evaluate("isinf(a)"), isinf(a)) + assert_equal(evaluate("signbit(a)"), signbit(a)) + + a = a.astype(np.float64) + assert a.dtype == np.float64 + assert_equal(evaluate("isnan(a)"), isnan(a)) + assert_equal(evaluate("isfinite(a)"), isfinite(a)) + assert_equal(evaluate("isinf(a)"), isinf(a)) + assert_equal(evaluate("signbit(a)"), signbit(a)) + + a = a.astype(np.complex128) + assert a.dtype == np.complex128 + assert np.all(evaluate("isnan(a)") == np.isnan(a)) + assert np.all(evaluate("isfinite(a)") == np.isfinite(a)) + assert np.all(evaluate("isinf(a)") == np.isinf(a)) + # signbit not defined for complex numbers + if 'sparc' not in platform.machine(): # Execution order set here so as to not use too many threads # during the rest of the execution. See #33 for details. 
+ @pytest.mark.thread_unsafe def test_changing_nthreads_00_inc(self): a = linspace(-1, 1, 1000000) b = ((.25 * a + .75) * a - 1.5) * a - 2 @@ -682,6 +812,7 @@ def test_changing_nthreads_00_inc(self): c = evaluate("((.25*a + .75)*a - 1.5)*a - 2") assert_array_almost_equal(b, c) + @pytest.mark.thread_unsafe def test_changing_nthreads_01_dec(self): a = linspace(-1, 1, 1000000) b = ((.25 * a + .75) * a - 1.5) * a - 2 @@ -734,13 +865,13 @@ def test_changing_nthreads_01_dec(self): for func in ['copy', 'ones_like', 'sqrt', 'sin', 'cos', 'tan', 'arcsin', 'arccos', 'arctan', 'sinh', 'cosh', 'tanh', 'arcsinh', 'arccosh', 'arctanh', - 'log', 'log1p', 'log10', 'exp', 'expm1', 'abs', 'conj', - 'ceil', 'floor']: + 'log', 'log1p', 'log10', "log2", 'exp', 'expm1', 'abs', 'conj', + 'ceil', 'floor', 'round', 'trunc', 'sign']: func1tests.append("a + %s(b+c)" % func) tests.append(('1_ARG_FUNCS', func1tests)) func2tests = [] -for func in ['arctan2', 'fmod']: +for func in ['arctan2', 'fmod', 'hypot', 'nextafter', 'copysign']: func2tests.append("a + %s(b+c, d+1)" % func) func2tests.append("a + %s(b+c, 1)" % func) func2tests.append("a + %s(1, d+1)" % func) @@ -783,109 +914,97 @@ def equal(a, b, exact): class Skip(Exception): pass -def test_expressions(): - test_no = [0] +@pytest.mark.parametrize( + "expr,test_scalar,dtype,optimization,exact,section_name", + [ + (expr, test_scalar, dtype, optimization, exact, section_name) + for test_scalar in (0, 1, 2) + for dtype in (int, int, np.float32, double, complex) + for optimization, exact in [ + ("none", False), + ("moderate", False), + ("aggressive", False), + ] + for section_name, section_tests in tests + for expr in section_tests + if not ( + dtype == complex + and ( + "<" in expr + or ">" in expr + or "%" in expr + or "arctan2" in expr + or "fmod" in expr + or "hypot" in expr + or "nextafter" in expr + or "copysign" in expr + or "trunc" in expr + or "floor" in expr + or "ceil" in expr + ) + ) + if not (dtype in (int, int) and test_scalar and 
expr == "(a+1) ** -1") + ], +) +def test_expressions( + expr, test_scalar, dtype, optimization, exact, section_name +): + array_size = 100 + a = arange(2 * array_size, dtype=dtype)[::2] + a2 = zeros([array_size, array_size], dtype=dtype) + b = arange(array_size, dtype=dtype) / array_size + c = arange(array_size, dtype=dtype) + d = arange(array_size, dtype=dtype) + e = arange(array_size, dtype=dtype) + x = None - def make_test_method(a, a2, b, c, d, e, x, expr, - test_scalar, dtype, optimization, exact, section): - this_locals = locals() + if dtype == complex: + a = a.real + for var in [a2, b, c, d, e]: + var += 1j + var *= 1 + 1j - def method(): - try: - # We don't want to listen at RuntimeWarnings like - # "overflows" or "divide by zero" in plain eval(). - warnings.simplefilter("ignore") - npval = eval(expr, globals(), this_locals) - warnings.simplefilter("always") - npval = eval(expr, globals(), this_locals) - except Exception as ex: - # just store the exception in a variable - # compatibility with numpy v1.12 - # see also https://github.com/pydata/numexpr/issues/239 - np_exception = ex - npval = None - else: - np_exception = None + if test_scalar == 1: + a = a[array_size // 2] + if test_scalar == 2: + b = b[array_size // 2] - try: - neval = evaluate(expr, local_dict=this_locals, - optimization=optimization) - except AssertionError: - raise - except NotImplementedError: - print('%r not implemented for %s (scalar=%d, opt=%s)' - % (expr, dtype.__name__, test_scalar, optimization)) - except Exception as ne_exception: - same_exc_type = issubclass(type(ne_exception), - type(np_exception)) - if np_exception is None or not same_exc_type: - print('numexpr error for expression %r' % (expr,)) - raise - except: - print('numexpr error for expression %r' % (expr,)) - raise - else: - msg = ('expected numexpr error not raised for expression ' - '%r' % (expr,)) - assert np_exception is None, msg - - assert equal(npval, neval, exact), """%r -(test_scalar=%r, dtype=%r, 
optimization=%r, exact=%r, - npval=%r (%r - %r)\n neval=%r (%r - %r))""" % (expr, test_scalar, dtype.__name__, - optimization, exact, - npval, type(npval), shape(npval), - neval, type(neval), shape(neval)) - - method.description = ('test_expressions(%s, test_scalar=%r, ' - 'dtype=%r, optimization=%r, exact=%r)') % (expr, test_scalar, dtype.__name__, optimization, exact) - test_no[0] += 1 - method.__name__ = 'test_scalar%d_%s_%s_%s_%04d' % (test_scalar, - dtype.__name__, - optimization.encode('ascii'), - section.encode('ascii'), - test_no[0]) - return method + # We don't want to listen at RuntimeWarnings like + # "overflows" or "divide by zero" in plain eval(). + warnings.simplefilter("ignore") + try: + npexpr = expr + if "sign" in expr and dtype==complex and np.__version__<"2.0": + #definition of sign changed in numpy 2.0 for complex numbers + npexpr = expr.replace("sign(b+c)", "(b+c)/abs(b+c)") + npval = eval(npexpr, globals(), locals()) + except Exception as ex: + np_exception = ex + npval = None + else: + np_exception = None + warnings.simplefilter("always") - x = None - for test_scalar in (0, 1, 2): - for dtype in (int, int, np.float32, double, complex): - array_size = 100 - a = arange(2 * array_size, dtype=dtype)[::2] - a2 = zeros([array_size, array_size], dtype=dtype) - b = arange(array_size, dtype=dtype) / array_size - c = arange(array_size, dtype=dtype) - d = arange(array_size, dtype=dtype) - e = arange(array_size, dtype=dtype) - if dtype == complex: - a = a.real - for x in [a2, b, c, d, e]: - x += 1j - x *= 1 + 1j - if test_scalar == 1: - a = a[array_size // 2] - if test_scalar == 2: - b = b[array_size // 2] - for optimization, exact in [ - ('none', False), ('moderate', False), ('aggressive', False)]: - for section_name, section_tests in tests: - for expr in section_tests: - if (dtype == complex and - ('<' in expr or '>' in expr or '%' in expr - or "arctan2" in expr or "fmod" in expr - or "floor" in expr or "ceil" in expr)): - # skip complex comparisons or 
functions not - # defined in complex domain. - continue - if (dtype in (int, int) and test_scalar and - expr == '(a+1) ** -1'): - continue - - m = make_test_method(a, a2, b, c, d, e, x, - expr, test_scalar, dtype, - optimization, exact, - section_name) - yield m + try: + neval = evaluate(expr, local_dict=locals(), optimization=optimization) + except AssertionError: + raise + except NotImplementedError: + pytest.skip( + f"{expr!r} not implemented for {dtype.__name__} (scalar={test_scalar}, opt={optimization})" + ) + except Exception as ne_exception: + same_exc_type = issubclass(type(ne_exception), type(np_exception)) + if np_exception is None or not same_exc_type: + pytest.fail(f"numexpr error for expression {expr!r}") + else: + if np_exception is not None: + pytest.fail(f"expected numexpr error not raised for expression {expr!r}") + assert equal(npval, neval, exact), f"""{expr!r} + (test_scalar={test_scalar!r}, dtype={dtype.__name__!r}, optimization={optimization!r}, exact={exact!r}, + npval={npval!r} ({type(npval)!r} - {shape(npval)!r}) + neval={neval!r} ({type(neval)!r} - {shape(neval)!r}))""" class test_int64(TestCase): def test_neg(self): @@ -1120,6 +1239,7 @@ def _environment(key, value): del os.environ[key] # Test cases for the threading configuration +@pytest.mark.thread_unsafe class test_threading_config(TestCase): def test_max_threads_unset(self): # Has to be done in a subprocess as `importlib.reload` doesn't let us @@ -1303,6 +1423,7 @@ def _worker(qout=None): # Case test for subprocesses (via multiprocessing module) class test_subprocess(TestCase): + @pytest.mark.thread_unsafe def test_multiprocess(self): try: import multiprocessing as mp @@ -1325,9 +1446,10 @@ def test_multiprocess(self): def print_versions(): """Print the versions of software that numexpr relies on.""" # from pkg_resources import parse_version - from numexpr.cpuinfo import cpu import platform + from numexpr.cpuinfo import cpu + print('-=' * 38) print('Numexpr version: %s' % 
numexpr.__version__) print('NumPy version: %s' % np.__version__) @@ -1368,31 +1490,50 @@ def test(verbosity=1): def suite(): - import unittest import platform as pl + import unittest theSuite = unittest.TestSuite() niter = 1 - class TestExpressions(TestCase): - pass + # Add the pytest parametrized tests only if pytest is available + if pytest_available: + # Create a class that will run the test_expressions function with different parameters + class TestExpressions(unittest.TestCase): + pass + + # Get the parameters from the pytest.mark.parametrize decorator + # This is safer than accessing internal pytest modules + marker = getattr(test_expressions, "pytestmark", None) + if marker and hasattr(marker[0], "args") and len(marker[0].args) >= 2: + param_list = marker[0].args[1] - def add_method(func): - def method(self): - return func() + # Create test methods dynamically + for i, params in enumerate(param_list): + expr, test_scalar, dtype, optimization, exact, section_name = params - setattr(TestExpressions, func.__name__, - method.__get__(None, TestExpressions)) + def create_test_method(params=params): + def test_method(self): + expr, test_scalar, dtype, optimization, exact, section_name = ( + params + ) + test_expressions( + expr, test_scalar, dtype, optimization, exact, section_name + ) - for func in test_expressions(): - add_method(func) + return test_method + + method_name = f"test_expr_{i}" + setattr(TestExpressions, method_name, create_test_method()) for n in range(niter): theSuite.addTest(unittest.defaultTestLoader.loadTestsFromTestCase(test_numexpr)) if 'sparc' not in platform.machine(): theSuite.addTest(unittest.defaultTestLoader.loadTestsFromTestCase(test_numexpr2)) theSuite.addTest(unittest.defaultTestLoader.loadTestsFromTestCase(test_evaluate)) - theSuite.addTest(unittest.defaultTestLoader.loadTestsFromTestCase(TestExpressions)) + # Add the dynamically created TestExpressions to the suite + if pytest_available: + 
theSuite.addTest(unittest.defaultTestLoader.loadTestsFromTestCase(TestExpressions)) theSuite.addTest(unittest.defaultTestLoader.loadTestsFromTestCase(test_int32_int64)) theSuite.addTest(unittest.defaultTestLoader.loadTestsFromTestCase(test_uint32_int64)) theSuite.addTest(unittest.defaultTestLoader.loadTestsFromTestCase(test_strings)) @@ -1411,6 +1552,7 @@ def method(self): # interaction with threads and subprocess :-/ theSuite.addTest(unittest.defaultTestLoader.loadTestsFromTestCase(test_threading)) + return theSuite diff --git a/numexpr/utils.py b/numexpr/utils.py index cc61833..9e45fbe 100644 --- a/numexpr/utils.py +++ b/numexpr/utils.py @@ -9,20 +9,22 @@ #################################################################### import logging + log = logging.getLogger(__name__) +import contextvars import os import subprocess -import contextvars -from numexpr.interpreter import _set_num_threads, _get_num_threads, MAX_THREADS from numexpr import use_vml +from numexpr.interpreter import MAX_THREADS, _get_num_threads, _set_num_threads + from . import version if use_vml: - from numexpr.interpreter import ( - _get_vml_version, _set_vml_accuracy_mode, _set_vml_num_threads, - _get_vml_num_threads) + from numexpr.interpreter import (_get_vml_num_threads, _get_vml_version, + _set_vml_accuracy_mode, + _set_vml_num_threads) def get_vml_version(): @@ -118,9 +120,9 @@ def get_num_threads(): def _init_num_threads(): """ - Detects the environment variable 'NUMEXPR_MAX_THREADS' to set the threadpool - size, and if necessary the slightly redundant 'NUMEXPR_NUM_THREADS' or - 'OMP_NUM_THREADS' env vars to set the initial number of threads used by + Detects the environment variable 'NUMEXPR_MAX_THREADS' to set the threadpool + size, and if necessary the slightly redundant 'NUMEXPR_NUM_THREADS' or + 'OMP_NUM_THREADS' env vars to set the initial number of threads used by the virtual machine. 
""" # Any platform-specific short-circuits @@ -140,7 +142,7 @@ def _init_num_threads(): env_configured = True n_cores = MAX_THREADS else: - # The use has not set 'NUMEXPR_MAX_THREADS', so likely they have not + # The use has not set 'NUMEXPR_MAX_THREADS', so likely they have not # configured NumExpr as desired, so we emit info logs. if n_cores > MAX_THREADS: log.info('Note: detected %d virtual cores but NumExpr set to maximum of %d, check "NUMEXPR_MAX_THREADS" environment variable.'%(n_cores, MAX_THREADS)) @@ -149,7 +151,7 @@ def _init_num_threads(): log.info('Note: NumExpr detected %d cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16.'%n_cores) n_cores = 16 - # Now we check for 'NUMEXPR_NUM_THREADS' or 'OMP_NUM_THREADS' to set the + # Now we check for 'NUMEXPR_NUM_THREADS' or 'OMP_NUM_THREADS' to set the # actual number of threads used. if 'NUMEXPR_NUM_THREADS' in os.environ and os.environ['NUMEXPR_NUM_THREADS'] != '': requested_threads = int(os.environ['NUMEXPR_NUM_THREADS']) @@ -165,7 +167,7 @@ def _init_num_threads(): set_num_threads(requested_threads) return requested_threads - + def detect_number_of_cores(): """ Detects the number of cores on a system. Cribbed from pp. diff --git a/numexpr/win32/stdint.h b/numexpr/win32/stdint.h index b7e7112..c66267a 100644 --- a/numexpr/win32/stdint.h +++ b/numexpr/win32/stdint.h @@ -17,7 +17,7 @@ * * mwb: This was modified in the following ways: * - * - make it compatible with Visual C++ 6 (which uses + * - make it compatible with Visual C++ 6 (which uses * non-standard keywords and suffixes for 64-bit types) * - some environments need stddef.h included (for wchar stuff?) 
* - handle the fact that Microsoft's limits.h header defines @@ -70,9 +70,9 @@ typedef unsigned uint_least32_t; typedef __STDINT_LONGLONG int_least64_t; typedef unsigned __STDINT_LONGLONG uint_least64_t; -/* 7.18.1.3 Fastest minimum-width integer types +/* 7.18.1.3 Fastest minimum-width integer types * Not actually guaranteed to be fastest for all purposes - * Here we use the exact-width types for 8 and 16-bit ints. + * Here we use the exact-width types for 8 and 16-bit ints. */ typedef char int_fast8_t; typedef unsigned char uint_fast8_t; @@ -110,7 +110,7 @@ typedef unsigned __STDINT_LONGLONG uintmax_t; #if !defined ( __cplusplus) || defined (__STDC_LIMIT_MACROS) /* 7.18.2.1 Limits of exact-width integer types */ -#define INT8_MIN (-128) +#define INT8_MIN (-128) #define INT16_MIN (-32768) #define INT32_MIN (-2147483647 - 1) #define INT64_MIN (PASTE( -9223372036854775807, __STDINT_LONGLONG_SUFFIX) - 1) @@ -158,7 +158,7 @@ typedef unsigned __STDINT_LONGLONG uintmax_t; #define UINT_FAST64_MAX UINT64_MAX /* 7.18.2.4 Limits of integer types capable of holding - object pointers */ + object pointers */ #ifdef _WIN64 #define INTPTR_MIN INT64_MIN #define INTPTR_MAX INT64_MAX @@ -186,7 +186,7 @@ typedef unsigned __STDINT_LONGLONG uintmax_t; #define SIZE_MAX UINTPTR_MAX #endif -#ifndef WCHAR_MIN /* also in wchar.h */ +#ifndef WCHAR_MIN /* also in wchar.h */ #define WCHAR_MIN 0 #define WCHAR_MAX ((wchar_t)-1) /* UINT16_MAX */ #endif diff --git a/pyproject.toml b/pyproject.toml index 9f2f025..264a999 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,6 @@ [build-system] requires = [ - "setuptools", - "wheel", + "setuptools>=77.0.0", "numpy>=2.0.0", ] build-backend = "setuptools.build_meta" @@ -15,23 +14,25 @@ description = "Fast numerical expression evaluator for NumPy" readme = "README.rst" authors = [{name = "David M. 
Cooke, Francesc Alted, and others", email = "blosc@blosc.org"}] maintainers = [{ name = "Blosc Development Team", email = "blosc@blosc.org"}] +license = "MIT" +license-files = ["LICENSE.txt", "LICENSES/*"] classifiers = [ "Development Status :: 6 - Mature", "Intended Audience :: Developers", "Intended Audience :: Information Technology", "Intended Audience :: Science/Research", - "License :: OSI Approved :: MIT License", "Programming Language :: Python", "Topic :: Software Development :: Libraries :: Python Modules", "Operating System :: Microsoft :: Windows", "Operating System :: Unix", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", ] -requires-python = ">=3.9" +requires-python = ">=3.10" # Follow guidelines from https://scientific-python.org/specs/spec-0000/ dependencies = [ "numpy>=1.23.0", @@ -43,8 +44,21 @@ homepage = "https://github.com/pydata/numexpr" documentation = "https://numexpr.readthedocs.io" repository = "https://github.com/pydata/numexpr" +[dependency-groups] +test = [ + "pytest>=7.0.0", + "pytest-run-parallel>=0.6.0", +] + [tool.cibuildwheel] -skip = "cp36-* cp37-* pp37-* cp38-* pp* *-manylinux_i686 *_ppc64le *_s390x" +build-verbosity = 1 +skip = ["*-manylinux_i686", "*_ppc64le", "*_s390x"] # Let's use a more recent version of the manylinux image for more modern compilers manylinux-x86_64-image = "manylinux_2_28" manylinux-aarch64-image = "manylinux_2_28" +test-groups = ["test"] +test-command = ["python -m pytest --pyargs numexpr"] + +[[tool.cibuildwheel.overrides]] +select = "cp31*t-*" +test-command = ["python -m pytest --parallel-threads=4 --pyargs numexpr"] diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..4fec170 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +markers 
= + thread_unsafe: mark a test as thread unsafe diff --git a/requirements.txt b/requirements.txt index 1c52baf..a4c58eb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -numpy >= 1.23.0 # keep in sync with NPY_TARGET_VERSION (setup.py) +numpy >= 1.26.0 # keep in sync with NPY_TARGET_VERSION (setup.py) diff --git a/setup.py b/setup.py index 82f3651..64d9f20 100644 --- a/setup.py +++ b/setup.py @@ -9,12 +9,13 @@ # rights to use. #################################################################### -import os, os.path as op -import platform import configparser -import numpy as np -from setuptools import setup, Extension +import os +import os.path as op +import platform +import numpy as np +from setuptools import Extension, setup with open('requirements.txt') as f: requirements = f.read().splitlines() @@ -40,7 +41,7 @@ libs = [] # Pre-built libraries ONLY, like python36.so clibs = [] def_macros = [ - # keep in sync with minimal runtime requirement (requirements.txt) + # keep in sync with minimal runtime requirement (requirements.txt) ('NPY_TARGET_VERSION', 'NPY_1_23_API_VERSION') ] sources = ['numexpr/interpreter.cpp', diff --git a/site.cfg.example b/site.cfg.example index c8e2dfb..1c0de0b 100644 --- a/site.cfg.example +++ b/site.cfg.example @@ -2,10 +2,12 @@ # file to "site.cfg" and edit the paths according to your installation of the # Intel MKL. -# Example for Intel(R) MKL 2018 on Linux +# Example for Intel(R) OneAPI MKL 2025 on Linux +# When compiling (with e.g. `pip install -e. -v`), first do a: +# $ source /opt/intel/oneapi/setvars.sh >/dev/null 2>&1 || true # [mkl] -# library_dirs = /opt/intel/compilers_and_libraries_2018/linux/mkl/lib/intel64 -# include_dirs = /opt/intel/compilers_and_libraries_2018/linux/mkl/include +# include_dirs = /opt/intel/oneapi/mkl/latest/include +# library_dirs = /opt/intel/oneapi/mkl/latest/lib/intel64 # libraries = mkl_rt