Updating OpenFace to version 2.2.0 (#741)

Change log:

Moving to C++17. This means the code can only be built with C++17-capable compilers (e.g. g++ 8 or newer, Visual Studio 2017, clang 5 or newer), fixing related bugs (#698, #629, #641)
Removing the explicit dependency on Boost (all filesystem operations are performed using either boost::filesystem or std::filesystem). If Boost is available, boost::filesystem is used; otherwise std::filesystem is used, which requires C++17 (a sketch follows this change log)
Visual Studio 2017 is now the main version for Visual Studio builds; VS 2015 is no longer supported
Updating OpenCV to version 4.1 (#511)
Fixing a bug with output images when using the GUI (#694)
Updating RAWImage (#609) so it can be initialized directly from a System.Drawing.Bitmap
Fixing overlap detection for multi-face tracking (#693)
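
As a rough illustration of the Boost fallback above, here is a minimal sketch. It assumes a hypothetical OPENFACE_USE_BOOST_FS macro standing in for whatever the build system actually defines when Boost is found; the real guard in the codebase may differ.

```cpp
// Minimal sketch of the boost::filesystem / std::filesystem fallback.
// OPENFACE_USE_BOOST_FS is a hypothetical macro name; the actual build
// system flag may differ.
#ifdef OPENFACE_USE_BOOST_FS
#include <boost/filesystem.hpp>
namespace fs = boost::filesystem;
#else
#include <filesystem>  // requires a C++17 compiler
namespace fs = std::filesystem;
#endif

#include <iostream>

int main()
{
    // Thanks to the namespace alias, the code below is identical
    // regardless of which filesystem backend was selected.
    fs::path model_dir("lib/local/LandmarkDetector/model");
    if (fs::exists(model_dir) && fs::is_directory(model_dir))
    {
        for (const auto& entry : fs::directory_iterator(model_dir))
            std::cout << entry.path().string() << '\n';
    }
    return 0;
}
```
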
Author: Tadas Baltrusaitis
Date: 2019-07-13 19:19:20 +01:00
Committed by: GitHub
Commit: e4a57e11d2 (parent: f023667fee)
2800 changed files with 37819 additions and 353126 deletions

.dockerignore

@@ -1,3 +1,5 @@
.git
docker-compose.yml
build
matlab_runners
matlab_version
@@ -5,4 +7,4 @@ python_scripts
test-dump
samples
model_training
gui
gui

.env Normal file

@@ -0,0 +1,6 @@
## This is read by docker-compose. You can overwrite these at runtime, e.g.:
## DOCKERUSER=bobfoo docker-compose build
DOCKERUSER=openface
DOCKERTAG=latest
DATA_MOUNT=/tmp/openface

.gitignore vendored

@@ -1,3 +1,5 @@
lib/local/LandmarkDetector/model/patch_experts/cen_patches_*.dat
matlab_version/experiments_menpo/out_semifrontal/
/x64/Release/
/x64/Debug/
@@ -100,3 +102,6 @@ lib/local/Utilities/Debug/
lib/3rdParty/CameraEnumerator/Debug/
lib/local/CppInerop/Debug/
*.user
# IDE-generated folders
.idea

.travis.yml

@@ -1,12 +1,13 @@
language: cpp
dist: trusty
dist: xenial
sudo: required
branches:
only:
- master
- develop
- feature/opencv4
compiler:
- gcc
@@ -16,14 +17,17 @@ os:
before_install:
# OpenCV dependencies, dlib and boost
# G++ 8 compiler, OpenCV dependencies, dlib, openblas, and tbb
- if [ ${TRAVIS_OS_NAME} = linux ]; then
sudo apt-get update;
sudo apt-get install libopenblas-dev;
cd ../;
sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test;
sudo apt-get update -qq;
sudo apt-get install -qq g++-8;
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-8 90;
sudo apt-get install git libgtk2.0-dev pkg-config libavcodec-dev libavformat-dev libswscale-dev;
sudo apt-get install python-dev python-numpy libtbb2 libtbb-dev libjpeg-dev libpng-dev libtiff-dev libdc1394-22-dev;
sudo apt-get install cmake;
sudo apt-get install libboost-all-dev;
sudo apt-get install libopenblas-dev;
wget http://dlib.net/files/dlib-19.13.tar.bz2;
tar xf dlib-19.13.tar.bz2;
cd dlib-19.13;
@@ -34,14 +38,14 @@ before_install:
sudo make install;
sudo ldconfig;
cd ../..;
wget https://github.com/opencv/opencv/archive/3.4.0.zip;
unzip -qq 3.4.0.zip;
cd opencv-3.4.0;
wget https://github.com/opencv/opencv/archive/4.0.0.zip;
unzip -qq 4.0.0.zip;
cd opencv-4.0.0;
mkdir build;
cd build;
fi
# g++4.8.1
# g++ TODO these should not be separated?
- if [ "$CXX" = "g++" ]; then
if [ ${TRAVIS_OS_NAME} = linux ]; then
$CXX --version;
@@ -52,7 +56,7 @@ before_install:
fi
fi
# clang 3.4
# clang
- if [ "$CXX" = "clang++" ]; then
if [ ${TRAVIS_OS_NAME} = linux ]; then
$CXX --version;
@@ -68,16 +72,18 @@ before_install:
brew install tbb;
brew install openblas;
brew install dlib;
brew install opencv3;
brew install opencv;
cd ../;
fi
script:
- cd OpenFace
- $CXX --version
- chmod 777 ./download_models.sh
- ./download_models.sh
- mkdir build
- cd build
- cmake -D CMAKE_BUILD_TYPE=RELEASE CMAKE_CXX_FLAGS="-std=c++11" -D CMAKE_EXE_LINKER_FLAGS="-std=c++11" ..
- cmake -D CMAKE_BUILD_TYPE=RELEASE ..
- make
- ../build/bin/FeatureExtraction -f "../samples/2015-10-15-15-14.avi" -q -mloc model/main_clm_general.txt
- ../build/bin/FaceLandmarkImg -fdir ../samples -out_dir data -multi_view 1 -wild -q

CMakeLists.txt

@@ -1,4 +1,6 @@
cmake_minimum_required (VERSION 3.2)
cmake_minimum_required (VERSION 3.8)
set(CMAKE_CXX_STANDARD 17)
project(OpenFace VERSION 2.0.2)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin/)
@@ -27,7 +29,7 @@ else()
MESSAGE(" OpenBLAS_INCLUDE: ${OpenBLAS_INCLUDE_DIR}")
endif()
find_package( OpenCV 3.3 REQUIRED COMPONENTS core imgproc calib3d highgui objdetect)
find_package( OpenCV 4.0 REQUIRED COMPONENTS core imgproc calib3d highgui objdetect)
if(${OpenCV_FOUND})
MESSAGE("OpenCV information:")
MESSAGE(" OpenCV_INCLUDE_DIRS: ${OpenCV_INCLUDE_DIRS}")
@@ -37,7 +39,7 @@ else()
MESSAGE(FATAL_ERROR "OpenCV not found in the system.")
endif()
find_package( Boost 1.5.9 REQUIRED COMPONENTS filesystem system)
find_package( Boost 1.5.9 COMPONENTS filesystem system)
if(${Boost_FOUND})
MESSAGE("Boost information:")
MESSAGE(" Boost_VERSION: ${Boost_VERSION}")
@@ -45,9 +47,10 @@ if(${Boost_FOUND})
MESSAGE(" Boost_LIBRARIES: ${Boost_LIBRARIES}")
MESSAGE(" Boost_LIBRARY_DIRS: ${Boost_LIBRARY_DIRS}")
else()
MESSAGE(FATAL_ERROR "Boost not found in the system.")
MESSAGE("Boost not found in the system.")
endif()
# Move LandmarkDetector model
file(GLOB files "lib/local/LandmarkDetector/model/*.txt")
foreach(file ${files})
@@ -134,13 +137,11 @@ endforeach()
if (${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
execute_process(COMMAND ${CMAKE_CXX_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (GCC_VERSION VERSION_LESS 4.7)
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x -msse -msse2 -msse3")
if (GCC_VERSION VERSION_LESS 8.0)
MESSAGE(FATAL_ERROR "Need a 8.0 or newer GCC compiler. Current GCC: ${GCC_VERSION}")
else ()
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -msse -msse2 -msse3")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse -msse2 -msse3")
endif ()
else ()
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -msse -msse2 -msse3")
endif ()
# dlib

Dockerfile (deleted; replaced by docker/Dockerfile)

@@ -1,63 +0,0 @@
FROM ubuntu:14.04 as build
LABEL maintainer="Edgar Aroutiounian <edgar.factorial@gmail.com>"
ARG DEBIAN_FRONTEND=noninteractive
ARG BUILD_DIR=/home/build-dep
ARG OPENFACE_DIR=/home/openface-build
RUN mkdir ${OPENFACE_DIR}
WORKDIR ${OPENFACE_DIR}
COPY ./CMakeLists.txt ${OPENFACE_DIR}
COPY ./cmake ${OPENFACE_DIR}/cmake
COPY ./exe ${OPENFACE_DIR}/exe
COPY ./lib ${OPENFACE_DIR}/lib
ADD https://www.dropbox.com/s/7na5qsjzz8yfoer/cen_patches_0.25_of.dat?dl=1 \
${OPENFACE_DIR}/lib/local/LandmarkDetector/model/patch_experts/cen_patches_0.25_of.dat
ADD https://www.dropbox.com/s/k7bj804cyiu474t/cen_patches_0.35_of.dat?dl=1 \
${OPENFACE_DIR}/lib/local/LandmarkDetector/model/patch_experts/cen_patches_0.35_of.dat
ADD https://www.dropbox.com/s/ixt4vkbmxgab1iu/cen_patches_0.50_of.dat?dl=1 \
${OPENFACE_DIR}/lib/local/LandmarkDetector/model/patch_experts/cen_patches_0.50_of.dat
ADD https://www.dropbox.com/s/2t5t1sdpshzfhpj/cen_patches_1.00_of.dat?dl=1 \
${OPENFACE_DIR}/lib/local/LandmarkDetector/model/patch_experts/cen_patches_1.00_of.dat
RUN mkdir ${BUILD_DIR}
ADD https://github.com/opencv/opencv/archive/3.4.0.zip ${BUILD_DIR}
RUN apt-get update && apt-get install -qq -y \
curl build-essential llvm clang-3.7 libc++-dev \
libc++abi-dev cmake libopenblas-dev liblapack-dev git libgtk2.0-dev \
pkg-config libavcodec-dev libavformat-dev libswscale-dev \
python-dev python-numpy libtbb2 libtbb-dev libjpeg-dev \
libpng-dev libtiff-dev libjasper-dev libdc1394-22-dev checkinstall \
libboost-all-dev wget unzip && \
rm -rf /var/lib/apt/lists/*
RUN cd ${BUILD_DIR} && unzip 3.4.0.zip && \
cd opencv-3.4.0 && \
mkdir -p build && \
cd build && \
cmake -D CMAKE_BUILD_TYPE=RELEASE \
-D CMAKE_INSTALL_PREFIX=/usr/local \
-D WITH_TBB=ON -D WITH_CUDA=OFF \
-D BUILD_SHARED_LIBS=OFF .. && \
make -j4 && \
make install
RUN cd ${OPENFACE_DIR} && mkdir -p build && cd build && \
cmake -D CMAKE_BUILD_TYPE=RELEASE .. && \
make -j4
RUN ln /dev/null /dev/raw1394
ENTRYPOINT ["/bin/bash"]

OpenFace.sln

@@ -1,7 +1,7 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 14
VisualStudioVersion = 14.0.25420.1
# Visual Studio 17
VisualStudioVersion = 15.8.2
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LandmarkDetector", "lib\local\LandmarkDetector\LandmarkDetector.vcxproj", "{BDC1D107-DE17-4705-8E7B-CDDE8BFB2BF8}"
EndProject
@@ -179,4 +179,7 @@ Global
{78196985-EE54-411F-822B-5A23EDF80642} = {652CCE53-4997-4B43-9A99-28D075199C99}
{50B7D4BF-E33B-41D0-AA89-76BBA57BF5CC} = {652CCE53-4997-4B43-9A99-28D075199C99}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {228609CD-6688-47E7-8D5B-5EE684F8A7A1}
EndGlobalSection
EndGlobal


@@ -1,182 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 14
VisualStudioVersion = 14.0.25420.1
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LandmarkDetector", "lib\local\LandmarkDetector\LandmarkDetector.vcxproj", "{BDC1D107-DE17-4705-8E7B-CDDE8BFB2BF8}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "FaceAnalyser", "lib\local\FaceAnalyser\FaceAnalyser.vcxproj", "{0E7FC556-0E80-45EA-A876-DDE4C2FEDCD7}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "FeatureExtraction", "exe\FeatureExtraction\FeatureExtraction.vcxproj", "{8A23C00D-767D-422D-89A3-CF225E3DAB4B}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Libraries", "Libraries", "{99FEBA13-BDDF-4076-B57E-D8EF4076E20D}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Executables", "Executables", "{9961DDAC-BE6E-4A6E-8EEF-FFC7D67BD631}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "FaceLandmarkVidMulti", "exe\FaceLandmarkVidMulti\FaceLandmarkVidMulti.vcxproj", "{C3FAF36F-44BC-4454-87C2-C5106575FE50}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Recording", "exe\Recording\Recording.vcxproj", "{2D80FA0B-2DE8-4475-BA5A-C08A9E1EDAAC}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "FaceLandmarkVid", "exe\FaceLandmarkVid\FaceLandmarkVid.vcxproj", "{34032CF2-1B99-4A25-9050-E9C13DD4CD0A}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "FaceLandmarkImg", "exe\FaceLandmarkImg\FaceLandmarkImg.vcxproj", "{DDC3535E-526C-44EC-9DF4-739E2D3A323B}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "GazeAnalyser", "lib\local\GazeAnalyser\GazeAnalyser.vcxproj", "{5F915541-F531-434F-9C81-79F5DB58012B}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "UtilLibs", "UtilLibs", "{652CCE53-4997-4B43-9A99-28D075199C99}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Utilities", "lib\local\Utilities\Utilities.vcxproj", "{8E741EA2-9386-4CF2-815E-6F9B08991EAC}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "gui", "gui", "{E59CF005-539F-484F-9AA6-9F08AC2DB31E}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "HeadPoseLive", "gui\HeadPose-live\HeadPoseLive.csproj", "{F396362D-821E-4EA6-9BBF-1F6050844118}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "OpenFaceDemo", "gui\OpenFaceDemo\OpenFaceDemo.csproj", "{E143A2AA-312E-4DFE-B61D-9A87CCBC8E90}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "OpenFaceOffline", "gui\OpenFaceOffline\OpenFaceOffline.csproj", "{A4760F41-2B1F-4144-B7B2-62785AFFE79B}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CppInerop", "lib\local\CppInerop\CppInerop.vcxproj", "{78196985-EE54-411F-822B-5A23EDF80642}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CameraEnumerator", "lib\3rdParty\CameraEnumerator\CameraEnumerator.vcxproj", "{50B7D4BF-E33B-41D0-AA89-76BBA57BF5CC}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Win32 = Debug|Win32
Debug|x64 = Debug|x64
Release|Win32 = Release|Win32
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{BDC1D107-DE17-4705-8E7B-CDDE8BFB2BF8}.Debug|Win32.ActiveCfg = Debug|Win32
{BDC1D107-DE17-4705-8E7B-CDDE8BFB2BF8}.Debug|Win32.Build.0 = Debug|Win32
{BDC1D107-DE17-4705-8E7B-CDDE8BFB2BF8}.Debug|x64.ActiveCfg = Debug|x64
{BDC1D107-DE17-4705-8E7B-CDDE8BFB2BF8}.Debug|x64.Build.0 = Debug|x64
{BDC1D107-DE17-4705-8E7B-CDDE8BFB2BF8}.Release|Win32.ActiveCfg = Release|Win32
{BDC1D107-DE17-4705-8E7B-CDDE8BFB2BF8}.Release|Win32.Build.0 = Release|Win32
{BDC1D107-DE17-4705-8E7B-CDDE8BFB2BF8}.Release|x64.ActiveCfg = Release|x64
{BDC1D107-DE17-4705-8E7B-CDDE8BFB2BF8}.Release|x64.Build.0 = Release|x64
{0E7FC556-0E80-45EA-A876-DDE4C2FEDCD7}.Debug|Win32.ActiveCfg = Debug|Win32
{0E7FC556-0E80-45EA-A876-DDE4C2FEDCD7}.Debug|Win32.Build.0 = Debug|Win32
{0E7FC556-0E80-45EA-A876-DDE4C2FEDCD7}.Debug|x64.ActiveCfg = Debug|x64
{0E7FC556-0E80-45EA-A876-DDE4C2FEDCD7}.Debug|x64.Build.0 = Debug|x64
{0E7FC556-0E80-45EA-A876-DDE4C2FEDCD7}.Release|Win32.ActiveCfg = Release|Win32
{0E7FC556-0E80-45EA-A876-DDE4C2FEDCD7}.Release|Win32.Build.0 = Release|Win32
{0E7FC556-0E80-45EA-A876-DDE4C2FEDCD7}.Release|x64.ActiveCfg = Release|x64
{0E7FC556-0E80-45EA-A876-DDE4C2FEDCD7}.Release|x64.Build.0 = Release|x64
{8A23C00D-767D-422D-89A3-CF225E3DAB4B}.Debug|Win32.ActiveCfg = Debug|Win32
{8A23C00D-767D-422D-89A3-CF225E3DAB4B}.Debug|Win32.Build.0 = Debug|Win32
{8A23C00D-767D-422D-89A3-CF225E3DAB4B}.Debug|x64.ActiveCfg = Debug|x64
{8A23C00D-767D-422D-89A3-CF225E3DAB4B}.Debug|x64.Build.0 = Debug|x64
{8A23C00D-767D-422D-89A3-CF225E3DAB4B}.Release|Win32.ActiveCfg = Release|Win32
{8A23C00D-767D-422D-89A3-CF225E3DAB4B}.Release|Win32.Build.0 = Release|Win32
{8A23C00D-767D-422D-89A3-CF225E3DAB4B}.Release|x64.ActiveCfg = Release|x64
{8A23C00D-767D-422D-89A3-CF225E3DAB4B}.Release|x64.Build.0 = Release|x64
{C3FAF36F-44BC-4454-87C2-C5106575FE50}.Debug|Win32.ActiveCfg = Debug|Win32
{C3FAF36F-44BC-4454-87C2-C5106575FE50}.Debug|Win32.Build.0 = Debug|Win32
{C3FAF36F-44BC-4454-87C2-C5106575FE50}.Debug|x64.ActiveCfg = Debug|x64
{C3FAF36F-44BC-4454-87C2-C5106575FE50}.Debug|x64.Build.0 = Debug|x64
{C3FAF36F-44BC-4454-87C2-C5106575FE50}.Release|Win32.ActiveCfg = Release|Win32
{C3FAF36F-44BC-4454-87C2-C5106575FE50}.Release|Win32.Build.0 = Release|Win32
{C3FAF36F-44BC-4454-87C2-C5106575FE50}.Release|x64.ActiveCfg = Release|x64
{C3FAF36F-44BC-4454-87C2-C5106575FE50}.Release|x64.Build.0 = Release|x64
{2D80FA0B-2DE8-4475-BA5A-C08A9E1EDAAC}.Debug|Win32.ActiveCfg = Debug|Win32
{2D80FA0B-2DE8-4475-BA5A-C08A9E1EDAAC}.Debug|Win32.Build.0 = Debug|Win32
{2D80FA0B-2DE8-4475-BA5A-C08A9E1EDAAC}.Debug|x64.ActiveCfg = Debug|x64
{2D80FA0B-2DE8-4475-BA5A-C08A9E1EDAAC}.Debug|x64.Build.0 = Debug|x64
{2D80FA0B-2DE8-4475-BA5A-C08A9E1EDAAC}.Release|Win32.ActiveCfg = Release|Win32
{2D80FA0B-2DE8-4475-BA5A-C08A9E1EDAAC}.Release|Win32.Build.0 = Release|Win32
{2D80FA0B-2DE8-4475-BA5A-C08A9E1EDAAC}.Release|x64.ActiveCfg = Release|x64
{2D80FA0B-2DE8-4475-BA5A-C08A9E1EDAAC}.Release|x64.Build.0 = Release|x64
{34032CF2-1B99-4A25-9050-E9C13DD4CD0A}.Debug|Win32.ActiveCfg = Debug|Win32
{34032CF2-1B99-4A25-9050-E9C13DD4CD0A}.Debug|Win32.Build.0 = Debug|Win32
{34032CF2-1B99-4A25-9050-E9C13DD4CD0A}.Debug|x64.ActiveCfg = Debug|x64
{34032CF2-1B99-4A25-9050-E9C13DD4CD0A}.Debug|x64.Build.0 = Debug|x64
{34032CF2-1B99-4A25-9050-E9C13DD4CD0A}.Release|Win32.ActiveCfg = Release|Win32
{34032CF2-1B99-4A25-9050-E9C13DD4CD0A}.Release|Win32.Build.0 = Release|Win32
{34032CF2-1B99-4A25-9050-E9C13DD4CD0A}.Release|x64.ActiveCfg = Release|x64
{34032CF2-1B99-4A25-9050-E9C13DD4CD0A}.Release|x64.Build.0 = Release|x64
{DDC3535E-526C-44EC-9DF4-739E2D3A323B}.Debug|Win32.ActiveCfg = Debug|Win32
{DDC3535E-526C-44EC-9DF4-739E2D3A323B}.Debug|Win32.Build.0 = Debug|Win32
{DDC3535E-526C-44EC-9DF4-739E2D3A323B}.Debug|x64.ActiveCfg = Debug|x64
{DDC3535E-526C-44EC-9DF4-739E2D3A323B}.Debug|x64.Build.0 = Debug|x64
{DDC3535E-526C-44EC-9DF4-739E2D3A323B}.Release|Win32.ActiveCfg = Release|Win32
{DDC3535E-526C-44EC-9DF4-739E2D3A323B}.Release|Win32.Build.0 = Release|Win32
{DDC3535E-526C-44EC-9DF4-739E2D3A323B}.Release|x64.ActiveCfg = Release|x64
{DDC3535E-526C-44EC-9DF4-739E2D3A323B}.Release|x64.Build.0 = Release|x64
{5F915541-F531-434F-9C81-79F5DB58012B}.Debug|Win32.ActiveCfg = Debug|Win32
{5F915541-F531-434F-9C81-79F5DB58012B}.Debug|Win32.Build.0 = Debug|Win32
{5F915541-F531-434F-9C81-79F5DB58012B}.Debug|x64.ActiveCfg = Debug|x64
{5F915541-F531-434F-9C81-79F5DB58012B}.Debug|x64.Build.0 = Debug|x64
{5F915541-F531-434F-9C81-79F5DB58012B}.Release|Win32.ActiveCfg = Release|Win32
{5F915541-F531-434F-9C81-79F5DB58012B}.Release|Win32.Build.0 = Release|Win32
{5F915541-F531-434F-9C81-79F5DB58012B}.Release|x64.ActiveCfg = Release|x64
{5F915541-F531-434F-9C81-79F5DB58012B}.Release|x64.Build.0 = Release|x64
{8E741EA2-9386-4CF2-815E-6F9B08991EAC}.Debug|Win32.ActiveCfg = Debug|Win32
{8E741EA2-9386-4CF2-815E-6F9B08991EAC}.Debug|Win32.Build.0 = Debug|Win32
{8E741EA2-9386-4CF2-815E-6F9B08991EAC}.Debug|x64.ActiveCfg = Debug|x64
{8E741EA2-9386-4CF2-815E-6F9B08991EAC}.Debug|x64.Build.0 = Debug|x64
{8E741EA2-9386-4CF2-815E-6F9B08991EAC}.Release|Win32.ActiveCfg = Release|Win32
{8E741EA2-9386-4CF2-815E-6F9B08991EAC}.Release|Win32.Build.0 = Release|Win32
{8E741EA2-9386-4CF2-815E-6F9B08991EAC}.Release|x64.ActiveCfg = Release|x64
{8E741EA2-9386-4CF2-815E-6F9B08991EAC}.Release|x64.Build.0 = Release|x64
{F396362D-821E-4EA6-9BBF-1F6050844118}.Debug|Win32.ActiveCfg = Debug|x86
{F396362D-821E-4EA6-9BBF-1F6050844118}.Debug|Win32.Build.0 = Debug|x86
{F396362D-821E-4EA6-9BBF-1F6050844118}.Debug|x64.ActiveCfg = Debug|Any CPU
{F396362D-821E-4EA6-9BBF-1F6050844118}.Debug|x64.Build.0 = Debug|Any CPU
{F396362D-821E-4EA6-9BBF-1F6050844118}.Release|Win32.ActiveCfg = Release|x86
{F396362D-821E-4EA6-9BBF-1F6050844118}.Release|Win32.Build.0 = Release|x86
{F396362D-821E-4EA6-9BBF-1F6050844118}.Release|x64.ActiveCfg = Release|Any CPU
{F396362D-821E-4EA6-9BBF-1F6050844118}.Release|x64.Build.0 = Release|Any CPU
{E143A2AA-312E-4DFE-B61D-9A87CCBC8E90}.Debug|Win32.ActiveCfg = Debug|x86
{E143A2AA-312E-4DFE-B61D-9A87CCBC8E90}.Debug|Win32.Build.0 = Debug|x86
{E143A2AA-312E-4DFE-B61D-9A87CCBC8E90}.Debug|x64.ActiveCfg = Debug|Any CPU
{E143A2AA-312E-4DFE-B61D-9A87CCBC8E90}.Debug|x64.Build.0 = Debug|Any CPU
{E143A2AA-312E-4DFE-B61D-9A87CCBC8E90}.Release|Win32.ActiveCfg = Release|x86
{E143A2AA-312E-4DFE-B61D-9A87CCBC8E90}.Release|Win32.Build.0 = Release|x86
{E143A2AA-312E-4DFE-B61D-9A87CCBC8E90}.Release|x64.ActiveCfg = Release|Any CPU
{E143A2AA-312E-4DFE-B61D-9A87CCBC8E90}.Release|x64.Build.0 = Release|Any CPU
{A4760F41-2B1F-4144-B7B2-62785AFFE79B}.Debug|Win32.ActiveCfg = Debug|x86
{A4760F41-2B1F-4144-B7B2-62785AFFE79B}.Debug|Win32.Build.0 = Debug|x86
{A4760F41-2B1F-4144-B7B2-62785AFFE79B}.Debug|x64.ActiveCfg = Debug|Any CPU
{A4760F41-2B1F-4144-B7B2-62785AFFE79B}.Debug|x64.Build.0 = Debug|Any CPU
{A4760F41-2B1F-4144-B7B2-62785AFFE79B}.Release|Win32.ActiveCfg = Release|x86
{A4760F41-2B1F-4144-B7B2-62785AFFE79B}.Release|Win32.Build.0 = Release|x86
{A4760F41-2B1F-4144-B7B2-62785AFFE79B}.Release|x64.ActiveCfg = Release|Any CPU
{A4760F41-2B1F-4144-B7B2-62785AFFE79B}.Release|x64.Build.0 = Release|Any CPU
{78196985-EE54-411F-822B-5A23EDF80642}.Debug|Win32.ActiveCfg = Debug|Win32
{78196985-EE54-411F-822B-5A23EDF80642}.Debug|Win32.Build.0 = Debug|Win32
{78196985-EE54-411F-822B-5A23EDF80642}.Debug|x64.ActiveCfg = Debug|x64
{78196985-EE54-411F-822B-5A23EDF80642}.Debug|x64.Build.0 = Debug|x64
{78196985-EE54-411F-822B-5A23EDF80642}.Release|Win32.ActiveCfg = Release|Win32
{78196985-EE54-411F-822B-5A23EDF80642}.Release|Win32.Build.0 = Release|Win32
{78196985-EE54-411F-822B-5A23EDF80642}.Release|x64.ActiveCfg = Release|x64
{78196985-EE54-411F-822B-5A23EDF80642}.Release|x64.Build.0 = Release|x64
{50B7D4BF-E33B-41D0-AA89-76BBA57BF5CC}.Debug|Win32.ActiveCfg = Debug|Win32
{50B7D4BF-E33B-41D0-AA89-76BBA57BF5CC}.Debug|Win32.Build.0 = Debug|Win32
{50B7D4BF-E33B-41D0-AA89-76BBA57BF5CC}.Debug|x64.ActiveCfg = Debug|x64
{50B7D4BF-E33B-41D0-AA89-76BBA57BF5CC}.Debug|x64.Build.0 = Debug|x64
{50B7D4BF-E33B-41D0-AA89-76BBA57BF5CC}.Release|Win32.ActiveCfg = Release|Win32
{50B7D4BF-E33B-41D0-AA89-76BBA57BF5CC}.Release|Win32.Build.0 = Release|Win32
{50B7D4BF-E33B-41D0-AA89-76BBA57BF5CC}.Release|x64.ActiveCfg = Release|x64
{50B7D4BF-E33B-41D0-AA89-76BBA57BF5CC}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(NestedProjects) = preSolution
{BDC1D107-DE17-4705-8E7B-CDDE8BFB2BF8} = {99FEBA13-BDDF-4076-B57E-D8EF4076E20D}
{0E7FC556-0E80-45EA-A876-DDE4C2FEDCD7} = {99FEBA13-BDDF-4076-B57E-D8EF4076E20D}
{8A23C00D-767D-422D-89A3-CF225E3DAB4B} = {9961DDAC-BE6E-4A6E-8EEF-FFC7D67BD631}
{C3FAF36F-44BC-4454-87C2-C5106575FE50} = {9961DDAC-BE6E-4A6E-8EEF-FFC7D67BD631}
{2D80FA0B-2DE8-4475-BA5A-C08A9E1EDAAC} = {9961DDAC-BE6E-4A6E-8EEF-FFC7D67BD631}
{34032CF2-1B99-4A25-9050-E9C13DD4CD0A} = {9961DDAC-BE6E-4A6E-8EEF-FFC7D67BD631}
{DDC3535E-526C-44EC-9DF4-739E2D3A323B} = {9961DDAC-BE6E-4A6E-8EEF-FFC7D67BD631}
{5F915541-F531-434F-9C81-79F5DB58012B} = {99FEBA13-BDDF-4076-B57E-D8EF4076E20D}
{8E741EA2-9386-4CF2-815E-6F9B08991EAC} = {652CCE53-4997-4B43-9A99-28D075199C99}
{F396362D-821E-4EA6-9BBF-1F6050844118} = {E59CF005-539F-484F-9AA6-9F08AC2DB31E}
{E143A2AA-312E-4DFE-B61D-9A87CCBC8E90} = {E59CF005-539F-484F-9AA6-9F08AC2DB31E}
{A4760F41-2B1F-4144-B7B2-62785AFFE79B} = {E59CF005-539F-484F-9AA6-9F08AC2DB31E}
{78196985-EE54-411F-822B-5A23EDF80642} = {652CCE53-4997-4B43-9A99-28D075199C99}
{50B7D4BF-E33B-41D0-AA89-76BBA57BF5CC} = {652CCE53-4997-4B43-9A99-28D075199C99}
EndGlobalSection
EndGlobal

appveyor.yml

@@ -1,8 +1,11 @@
version: 1.0.{build}
image: Visual Studio 2017
branches:
only:
- develop
- feature/opencv4
- master
- feature/boost_removal
max_jobs: 4
configuration:
- Release
@@ -12,19 +15,9 @@ platform:
- Win32
# scripts that run after cloning repository
install:
- ps: $source = "https://www.dropbox.com/s/7na5qsjzz8yfoer/cen_patches_0.25_of.dat?dl=1"
- ps: $destination = "lib/local/LandmarkDetector/model/patch_experts/cen_patches_0.25_of.dat"
- ps: Invoke-WebRequest $source -OutFile $destination
- ps: $source = "https://www.dropbox.com/s/k7bj804cyiu474t/cen_patches_0.35_of.dat?dl=1"
- ps: $destination = "lib/local/LandmarkDetector/model/patch_experts/cen_patches_0.35_of.dat"
- ps: Invoke-WebRequest $source -OutFile $destination
- ps: $source = "https://www.dropbox.com/s/ixt4vkbmxgab1iu/cen_patches_0.50_of.dat?dl=1"
- ps: $destination = "lib/local/LandmarkDetector/model/patch_experts/cen_patches_0.50_of.dat"
- ps: Invoke-WebRequest $source -OutFile $destination
- ps: $source = "https://www.dropbox.com/s/2t5t1sdpshzfhpj/cen_patches_1.00_of.dat?dl=1"
- ps: $destination = "lib/local/LandmarkDetector/model/patch_experts/cen_patches_1.00_of.dat"
- ps: Invoke-WebRequest $source -OutFile $destination
- ps: '& ".\download_libraries.ps1"'
- ps: '& ".\download_models.ps1"'
build:
project: OpenFace.sln
verbosity: minimal

docker-compose.yml Normal file

@@ -0,0 +1,24 @@
---
# Variables can be set from the shell: `DOCKERTAG=foo docker-compose build`
# or from a .env file. See docker-compose documentation for details
# Variable DOCKERUSER should be set to your dockerhub user
# Alternatively, use a docker registry url as the image name
version: '3.7'
services:
## Image runtime service
## This can be used to add volume mounts or pass environment variables
## Todo: make a service which can use the container interactively
openface:
container_name: openface
build:
context: .
dockerfile: docker/Dockerfile
image: "${DOCKERUSER}/openface:${DOCKERTAG}"
tty: true
volumes:
- "${DATA_MOUNT}:${DATA_MOUNT}"
environment:
DATA_MOUNT: "${DATA_MOUNT}"
command: ["bash"]
...

docker/Dockerfile Normal file

@@ -0,0 +1,110 @@
# ==================== Building Model Layer ===========================
# This is a little trick to improve caching and minimize rebuild time
# and bandwidth. Note that RUN commands only cache-miss if the prior layers
# miss, or the dockerfile changes prior to this step.
# To update these patch files, be sure to run build with --no-cache
FROM alpine as model_data
RUN apk --no-cache --update-cache add wget
WORKDIR /data/patch_experts
RUN wget -q https://www.dropbox.com/s/7na5qsjzz8yfoer/cen_patches_0.25_of.dat &&\
wget -q https://www.dropbox.com/s/k7bj804cyiu474t/cen_patches_0.35_of.dat &&\
wget -q https://www.dropbox.com/s/ixt4vkbmxgab1iu/cen_patches_0.50_of.dat &&\
wget -q https://www.dropbox.com/s/2t5t1sdpshzfhpj/cen_patches_1.00_of.dat
## ==================== Install Ubuntu Base libs ===========================
## This will be our base image for OpenFace, and also the base for the compiler
## image. We only need packages which are linked
FROM ubuntu:18.04 as ubuntu_base
LABEL maintainer="Michael McDermott <mikemcdermott23@gmail.com>"
ARG DEBIAN_FRONTEND=noninteractive
# todo: minimize this even more
RUN apt-get update -qq &&\
apt-get install -qq curl &&\
apt-get install -qq --no-install-recommends \
libopenblas-dev liblapack-dev \
libavcodec-dev libavformat-dev libswscale-dev \
libtbb2 libtbb-dev libjpeg-dev \
libpng-dev libtiff-dev &&\
rm -rf /var/lib/apt/lists/*
## ==================== Build-time dependency libs ======================
## This will build and install opencv and dlib into an additional dummy
## directory, /root/diff, so we can later copy in these artifacts,
## minimizing docker layer size
## Protip: ninja is faster than `make -j` and less likely to lock up system
FROM ubuntu_base as cv_deps
WORKDIR /root/build-dep
ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get update -qq && apt-get install -qq -y \
cmake ninja-build pkg-config build-essential checkinstall\
g++-8 &&\
rm -rf /var/lib/apt/lists/* &&\
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-8 800 --slave /usr/bin/g++ g++ /usr/bin/g++-8
## llvm clang-3.7 libc++-dev libc++abi-dev \
## ==================== Building dlib ===========================
RUN curl http://dlib.net/files/dlib-19.13.tar.bz2 -LO &&\
tar xf dlib-19.13.tar.bz2 && \
rm dlib-19.13.tar.bz2 &&\
mv dlib-19.13 dlib &&\
mkdir -p dlib/build &&\
cd dlib/build &&\
cmake -DCMAKE_BUILD_TYPE=Release -G Ninja .. &&\
ninja && \
ninja install && \
DESTDIR=/root/diff ninja install &&\
ldconfig
## ==================== Building OpenCV ======================
ENV OPENCV_VERSION=4.1.0
RUN curl https://github.com/opencv/opencv/archive/${OPENCV_VERSION}.tar.gz -LO &&\
tar xf ${OPENCV_VERSION}.tar.gz && \
rm ${OPENCV_VERSION}.tar.gz &&\
mv opencv-${OPENCV_VERSION} opencv && \
mkdir -p opencv/build && \
cd opencv/build && \
cmake -D CMAKE_BUILD_TYPE=RELEASE \
-D CMAKE_INSTALL_PREFIX=/usr/local \
-D WITH_TBB=ON -D WITH_CUDA=OFF \
-DWITH_QT=OFF -DWITH_GTK=OFF\
-G Ninja .. && \
ninja && \
ninja install &&\
DESTDIR=/root/diff ninja install
## ==================== Building OpenFace ===========================
FROM cv_deps as openface
WORKDIR /root/openface
COPY ./ ./
COPY --from=model_data /data/patch_experts/* \
/root/openface/lib/local/LandmarkDetector/model/patch_experts/
RUN mkdir -p build && cd build && \
cmake -D CMAKE_BUILD_TYPE=RELEASE -G Ninja .. && \
ninja &&\
DESTDIR=/root/diff ninja install
## ==================== Streamline container ===========================
## Clean up - start fresh and only copy in necessary stuff
## This shrinks the image from ~8 GB to ~1.6 GB
FROM ubuntu_base as final
WORKDIR /root
# Copy in only necessary libraries
COPY --from=openface /root/diff /
# Since we "imported" the build artifacts, we need to reconfigure ld
RUN ldconfig

docker/README.md Normal file

@@ -0,0 +1,57 @@
# Docker building instructions
This image can be built with just `docker`, but it is highly recommended to use
`docker-compose`, as this greatly simplifies and improves the process.
## Quick start
To start with the container hosted by the repo maintainer, run
`docker run -it --rm --name openface algebr/openface:latest`
This will drop you into a shell with binaries such as FaceLandmarkImg. For example,
try this code (in the container):
```bash
curl -o tesla.jpg https://upload.wikimedia.org/wikipedia/commons/thumb/b/b7/Nicola_Tesla_LCCN2014684845.jpg/559px-Nicola_Tesla_LCCN2014684845.jpg
FaceLandmarkImg -f tesla.jpg
```
Then, copy the output to the host system (from host terminal):
```bash
docker cp openface:/root/processed /tmp/
cd /tmp/processed
```
Tip: On Ubuntu and other *nixes with X running, you can open a file directly
like this:
```bash
xdg-open /tmp/processed/tesla.jpg
```
## Building
From the repo root, run `docker-compose build` to automatically build and tag the image.
Two variables can be used to modify the tag, `$DOCKERUSER` and
`$DOCKERTAG`; docker-compose will automatically tag the image as
`${DOCKERUSER}/openface:${DOCKERTAG}`
## OpenFace service (in progress)
To run OpenFace like a service, you can start the container with bind mounts
in order to pass data into and out of the container easily.
By default, `$DATA_MOUNT` is set to `/tmp/openface`. This can be overridden by
modifying the `.env` file, setting it in your shell environment, or passing it in
before the `docker-compose` command at runtime. Note: output files will be owned by `root`.
```bash
export DATA_MOUNT=/tmp/openface
mkdir -p $DATA_MOUNT/tesla # this is just to ensure this is writable by user
docker-compose up -d openface && sync # sync is to wait till service starts
curl -o $DATA_MOUNT/tesla.jpg \
https://upload.wikimedia.org/wikipedia/commons/thumb/b/b7/Nicola_Tesla_LCCN2014684845.jpg/559px-Nicola_Tesla_LCCN2014684845.jpg
docker exec -it openface FaceLandmarkImg -f $DATA_MOUNT/tesla.jpg -out_dir $DATA_MOUNT/tesla
docker exec -it openface chown -R $UID:$UID $DATA_MOUNT # chown to current user
docker-compose down # stop service if you wish
```

download_libraries.ps1 Normal file

@@ -0,0 +1,127 @@
# Download the OpenCV libraries from the cloud (stored in Dropbox and OneDrive)
# ffmpeg x64 dll
$destination = "lib/3rdParty/OpenCV/bin/opencv_ffmpeg410_64.dll"
$out_dir = Join-Path (Get-Location) "lib/3rdParty/OpenCV/bin"
if(!(Test-Path $out_dir))
{
New-Item -ItemType directory -Path $out_dir
}
if(!([System.IO.File]::Exists( (Join-Path (Get-Location) $destination) )))
{
$source = "https://www.dropbox.com/s/gvkd4549wsjvn3u/opencv_ffmpeg410_64.dll?dl=1"
Invoke-WebRequest $source -OutFile $destination
}
if(!([System.IO.File]::Exists( (Join-Path (Get-Location) $destination) )))
{
$source = "https://onedrive.live.com/download?cid=2E2ADA578BFF6E6E&resid=2E2ADA578BFF6E6E%2153244&authkey=AEuLAF197Sy7S3M"
Invoke-WebRequest $source -OutFile $destination
}
# Release x64 dll
$destination = "lib/3rdParty/OpenCV/x64/v141/bin/Release/opencv_world410.dll"
$out_dir = Join-Path (Get-Location) "lib/3rdParty/OpenCV/x64/v141/bin/Release"
if(!(Test-Path $out_dir))
{
New-Item -ItemType directory -Path $out_dir
}
#if(!([System.IO.File]::Exists( (Join-Path (Get-Location) $destination) )))
#{
# $source = "https://www.dropbox.com/s/c81shi8br57xytv/opencv_world410.dll?dl=1"
# Invoke-WebRequest $source -OutFile $destination
#}
if(!([System.IO.File]::Exists( (Join-Path (Get-Location) $destination) )))
{
$source = "https://onedrive.live.com/download?cid=2E2ADA578BFF6E6E&resid=2E2ADA578BFF6E6E%2153246&authkey=AJkwseAGKqL3PpU"
Invoke-WebRequest $source -OutFile $destination
}
# Debug x64 dll
$destination = "lib/3rdParty/OpenCV/x64/v141/bin/Debug/opencv_world410d.dll"
$out_dir = Join-Path (Get-Location) "lib/3rdParty/OpenCV/x64/v141/bin/Debug"
if(!(Test-Path $out_dir))
{
New-Item -ItemType directory -Path $out_dir
}
#if(!([System.IO.File]::Exists( (Join-Path (Get-Location) $destination) )))
#{
# $source = "https://www.dropbox.com/s/8a4kmvpj5a09jdz/opencv_world410d.dll?dl=1"
# Invoke-WebRequest $source -OutFile $destination
#}
if(!([System.IO.File]::Exists( (Join-Path (Get-Location) $destination) )))
{
$source = "https://onedrive.live.com/download?cid=2E2ADA578BFF6E6E&resid=2E2ADA578BFF6E6E%2153247&authkey=AJE4pLhzjtkPDWs"
Invoke-WebRequest $source -OutFile $destination
}
# ffmpeg x32 dll
$destination = "lib/3rdParty/OpenCV/bin/opencv_ffmpeg410.dll"
$out_dir = Join-Path (Get-Location) "lib/3rdParty/OpenCV/bin"
if(!(Test-Path $out_dir))
{
New-Item -ItemType directory -Path $out_dir
}
if(!([System.IO.File]::Exists( (Join-Path (Get-Location) $destination) )))
{
$source = "https://www.dropbox.com/s/7pada6etpui97f7/opencv_ffmpeg410.dll?dl=1"
Invoke-WebRequest $source -OutFile $destination
}
if(!([System.IO.File]::Exists( (Join-Path (Get-Location) $destination) )))
{
$source = "https://onedrive.live.com/download?cid=2E2ADA578BFF6E6E&resid=2E2ADA578BFF6E6E%2153252&authkey=AEwNPWYZ7sOOhXo"
Invoke-WebRequest $source -OutFile $destination
}
# Release x32 dll
$destination = "lib/3rdParty/OpenCV/x86/v141/bin/Release/opencv_world410.dll"
$out_dir = Join-Path (Get-Location) "lib/3rdParty/OpenCV/x86/v141/bin/Release"
if(!(Test-Path $out_dir))
{
New-Item -ItemType directory -Path $out_dir
}
if(!([System.IO.File]::Exists( (Join-Path (Get-Location) $destination) )))
{
$source = "https://www.dropbox.com/s/qf4rqphvrj2k4d1/opencv_world410.dll?dl=1"
Invoke-WebRequest $source -OutFile $destination
}
if(!([System.IO.File]::Exists( (Join-Path (Get-Location) $destination) )))
{
$source = "https://onedrive.live.com/download?cid=2E2ADA578BFF6E6E&resid=2E2ADA578BFF6E6E%2153255&authkey=AJ0S3GY4fCYKFXo"
Invoke-WebRequest $source -OutFile $destination
}
# Debug x32 dll
$destination = "lib/3rdParty/OpenCV/x86/v141/bin/Debug/opencv_world410d.dll"
$out_dir = Join-Path (Get-Location) "lib/3rdParty/OpenCV/x86/v141/bin/Debug"
if(!(Test-Path $out_dir))
{
New-Item -ItemType directory -Path $out_dir
}
if(!([System.IO.File]::Exists( (Join-Path (Get-Location) $destination) )))
{
$source = "https://www.dropbox.com/s/kafps88dbdlg5y2/opencv_world410d.dll?dl=1"
Invoke-WebRequest $source -OutFile $destination
}
if(!([System.IO.File]::Exists( (Join-Path (Get-Location) $destination) )))
{
$source = "https://onedrive.live.com/download?cid=2E2ADA578BFF6E6E&resid=2E2ADA578BFF6E6E%2153256&authkey=ALbDTeRByHmWO-M"
Invoke-WebRequest $source -OutFile $destination
}

download_models.ps1

@@ -1,4 +1,4 @@
# Download the models from the cloud (stored in Dropbox, OneDrive, and Google Drive)
# Download the models from the cloud (stored in Dropbox and OneDrive)
# Determine correct path to the model files
if([System.IO.Directory]::Exists( (Join-Path (Get-Location) 'lib') ))

exe/FaceLandmarkImg/FaceLandmarkImg.cpp

@@ -52,16 +52,14 @@
#define CONFIG_DIR "~"
#endif
using namespace std;
vector<string> get_arguments(int argc, char **argv)
std::vector<std::string> get_arguments(int argc, char **argv)
{
vector<string> arguments;
std::vector<std::string> arguments;
for (int i = 0; i < argc; ++i)
{
arguments.push_back(string(argv[i]));
arguments.push_back(std::string(argv[i]));
}
return arguments;
}
@@ -70,13 +68,13 @@ int main(int argc, char **argv)
{
//Convert arguments to more convenient vector form
vector<string> arguments = get_arguments(argc, argv);
std::vector<std::string> arguments = get_arguments(argc, argv);
// no arguments: output usage
if (arguments.size() == 1)
{
cout << "For command line arguments see:" << endl;
cout << " https://github.com/TadasBaltrusaitis/OpenFace/wiki/Command-line-arguments";
std::cout << "For command line arguments see:" << std::endl;
std::cout << " https://github.com/TadasBaltrusaitis/OpenFace/wiki/Command-line-arguments";
return 0;
}
@@ -86,7 +84,7 @@ int main(int argc, char **argv)
// The sequence reader chooses what to open based on command line arguments provided
if (!image_reader.Open(arguments))
{
cout << "Could not open any images" << endl;
std::cout << "Could not open any images" << std::endl;
return 1;
}
@@ -94,16 +92,16 @@ int main(int argc, char **argv)
LandmarkDetector::FaceModelParameters det_parameters(arguments);
// The modules that are being used for tracking
cout << "Loading the model" << endl;
std::cout << "Loading the model" << std::endl;
LandmarkDetector::CLNF face_model(det_parameters.model_location);
if (!face_model.loaded_successfully)
{
cout << "ERROR: Could not load the landmark detector" << endl;
std::cout << "ERROR: Could not load the landmark detector" << std::endl;
return 1;
}
cout << "Model loaded" << endl;
std::cout << "Model loaded" << std::endl;
// Load facial feature extractor and AU analyser (make sure it is static)
FaceAnalysis::FaceAnalyserParameters face_analysis_params(arguments);
@@ -118,7 +116,7 @@ int main(int argc, char **argv)
// If can't find MTCNN face detector, default to HOG one
if (det_parameters.curr_face_detector == LandmarkDetector::FaceModelParameters::MTCNN_DETECTOR && face_detector_mtcnn.empty())
{
cout << "INFO: defaulting to HOG-SVM face detector" << endl;
std::cout << "INFO: defaulting to HOG-SVM face detector" << std::endl;
det_parameters.curr_face_detector = LandmarkDetector::FaceModelParameters::HOG_SVM_DETECTOR;
}
@@ -131,15 +129,15 @@ int main(int argc, char **argv)
if (!face_model.eye_model)
{
cout << "WARNING: no eye model found" << endl;
std::cout << "WARNING: no eye model found" << std::endl;
}
if (face_analyser.GetAUClassNames().size() == 0 && face_analyser.GetAUClassNames().size() == 0)
{
cout << "WARNING: no Action Unit models found" << endl;
std::cout << "WARNING: no Action Unit models found" << std::endl;
}
cout << "Starting tracking" << endl;
std::cout << "Starting tracking" << std::endl;
while (!rgb_image.empty())
{
@@ -158,7 +156,7 @@ int main(int argc, char **argv)
cv::Mat_<uchar> grayscale_image = image_reader.GetGrayFrame();
// Detect faces in an image
vector<cv::Rect_<float> > face_detections;
std::vector<cv::Rect_<float> > face_detections;
if (image_reader.has_bounding_boxes)
{
@@ -168,7 +166,7 @@ int main(int argc, char **argv)
{
if (det_parameters.curr_face_detector == LandmarkDetector::FaceModelParameters::HOG_SVM_DETECTOR)
{
vector<float> confidences;
std::vector<float> confidences;
LandmarkDetector::DetectFacesHOG(face_detections, grayscale_image, face_detector_hog, confidences);
}
else if (det_parameters.curr_face_detector == LandmarkDetector::FaceModelParameters::HAAR_DETECTOR)
@@ -177,7 +175,7 @@ int main(int argc, char **argv)
}
else
{
vector<float> confidences;
std::vector<float> confidences;
LandmarkDetector::DetectFacesMTCNN(face_detections, rgb_image, face_detector_mtcnn, confidences);
}
}

exe/FaceLandmarkImg/FaceLandmarkImg.vcxproj

@@ -23,34 +23,34 @@
<Keyword>Win32Proj</Keyword>
<RootNamespace>FaceLandmarkImg</RootNamespace>
<ProjectName>FaceLandmarkImg</ProjectName>
<WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion>
<WindowsTargetPlatformVersion>10.0.17134.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>Unicode</CharacterSet>
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>Unicode</CharacterSet>
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
@@ -58,29 +58,25 @@
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
<Import Project="..\..\lib\3rdParty\dlib\dlib.props" />
<Import Project="..\..\lib\3rdParty\boost\boost_d.props" />
<Import Project="..\..\lib\3rdParty\OpenCV3.4\openCV3.4.props" />
<Import Project="..\..\lib\3rdParty\OpenCV\openCV.props" />
<Import Project="..\..\lib\3rdParty\OpenBLAS\OpenBLAS_x86.props" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
<Import Project="..\..\lib\3rdParty\dlib\dlib.props" />
<Import Project="..\..\lib\3rdParty\boost\boost_d.props" />
<Import Project="..\..\lib\3rdParty\OpenCV3.4\openCV3.4.props" />
<Import Project="..\..\lib\3rdParty\OpenCV\openCV.props" />
<Import Project="..\..\lib\3rdParty\OpenBLAS\OpenBLAS_64.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
<Import Project="..\..\lib\3rdParty\boost\boost.props" />
<Import Project="..\..\lib\3rdParty\dlib\dlib.props" />
<Import Project="..\..\lib\3rdParty\OpenCV3.4\openCV3.4.props" />
<Import Project="..\..\lib\3rdParty\OpenCV\openCV.props" />
<Import Project="..\..\lib\3rdParty\OpenBLAS\OpenBLAS_x86.props" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
<Import Project="..\..\lib\3rdParty\boost\boost.props" />
<Import Project="..\..\lib\3rdParty\dlib\dlib.props" />
<Import Project="..\..\lib\3rdParty\OpenCV3.4\openCV3.4.props" />
<Import Project="..\..\lib\3rdParty\OpenCV\openCV.props" />
<Import Project="..\..\lib\3rdParty\OpenBLAS\OpenBLAS_64.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
@@ -111,6 +107,7 @@
<AdditionalIncludeDirectories>$(SolutionDir)\lib\local\LandmarkDetector\include;$(SolutionDir)\lib\local\FaceAnalyser\include;$(SolutionDir)\lib\local\GazeAnalyser\include;$(SolutionDir)\lib\local\Utilities\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<LanguageStandard>stdcpp17</LanguageStandard>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
@@ -127,6 +124,7 @@
<EnableEnhancedInstructionSet>
</EnableEnhancedInstructionSet>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<LanguageStandard>stdcpp17</LanguageStandard>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
@@ -147,6 +145,7 @@
<EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<LanguageStandard>stdcpp17</LanguageStandard>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
@@ -170,6 +169,7 @@
</EnableEnhancedInstructionSet>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<LanguageStandard>stdcpp17</LanguageStandard>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>

exe/FaceLandmarkVid/FaceLandmarkVid.cpp

@@ -59,16 +59,14 @@ static void printErrorAndAbort(const std::string & error)
#define FATAL_STREAM( stream ) \
printErrorAndAbort( std::string( "Fatal error: " ) + stream )
using namespace std;
vector<string> get_arguments(int argc, char **argv)
std::vector<std::string> get_arguments(int argc, char **argv)
{
vector<string> arguments;
std::vector<std::string> arguments;
for (int i = 0; i < argc; ++i)
{
arguments.push_back(string(argv[i]));
arguments.push_back(std::string(argv[i]));
}
return arguments;
}
@@ -76,13 +74,13 @@ vector<string> get_arguments(int argc, char **argv)
int main(int argc, char **argv)
{
vector<string> arguments = get_arguments(argc, argv);
std::vector<std::string> arguments = get_arguments(argc, argv);
// no arguments: output usage
if (arguments.size() == 1)
{
cout << "For command line arguments see:" << endl;
cout << " https://github.com/TadasBaltrusaitis/OpenFace/wiki/Command-line-arguments";
std::cout << "For command line arguments see:" << std::endl;
std::cout << " https://github.com/TadasBaltrusaitis/OpenFace/wiki/Command-line-arguments";
return 0;
}
@@ -92,13 +90,13 @@ int main(int argc, char **argv)
LandmarkDetector::CLNF face_model(det_parameters.model_location);
if (!face_model.loaded_successfully)
{
cout << "ERROR: Could not load the landmark detector" << endl;
std::cout << "ERROR: Could not load the landmark detector" << std::endl;
return 1;
}
if (!face_model.eye_model)
{
cout << "WARNING: no eye model found" << endl;
std::cout << "WARNING: no eye model found" << std::endl;
}
// Open a sequence

exe/FaceLandmarkVid/FaceLandmarkVid.vcxproj

@@ -23,34 +23,34 @@
<Keyword>Win32Proj</Keyword>
<RootNamespace>FaceLandmarkVid</RootNamespace>
<ProjectName>FaceLandmarkVid</ProjectName>
<WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion>
<WindowsTargetPlatformVersion>10.0.17134.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>Unicode</CharacterSet>
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>Unicode</CharacterSet>
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
@@ -58,29 +58,25 @@
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
<Import Project="..\..\lib\3rdParty\dlib\dlib.props" />
<Import Project="..\..\lib\3rdParty\boost\boost_d.props" />
<Import Project="..\..\lib\3rdParty\OpenCV3.4\openCV3.4.props" />
<Import Project="..\..\lib\3rdParty\OpenCV\openCV.props" />
<Import Project="..\..\lib\3rdParty\OpenBLAS\OpenBLAS_x86.props" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
<Import Project="..\..\lib\3rdParty\dlib\dlib.props" />
<Import Project="..\..\lib\3rdParty\boost\boost_d.props" />
<Import Project="..\..\lib\3rdParty\OpenCV3.4\openCV3.4.props" />
<Import Project="..\..\lib\3rdParty\OpenCV\openCV.props" />
<Import Project="..\..\lib\3rdParty\OpenBLAS\OpenBLAS_64.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
<Import Project="..\..\lib\3rdParty\boost\boost.props" />
<Import Project="..\..\lib\3rdParty\dlib\dlib.props" />
<Import Project="..\..\lib\3rdParty\OpenCV3.4\openCV3.4.props" />
<Import Project="..\..\lib\3rdParty\OpenCV\openCV.props" />
<Import Project="..\..\lib\3rdParty\OpenBLAS\OpenBLAS_x86.props" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
<Import Project="..\..\lib\3rdParty\boost\boost.props" />
<Import Project="..\..\lib\3rdParty\dlib\dlib.props" />
<Import Project="..\..\lib\3rdParty\OpenCV3.4\openCV3.4.props" />
<Import Project="..\..\lib\3rdParty\OpenCV\openCV.props" />
<Import Project="..\..\lib\3rdParty\OpenBLAS\OpenBLAS_64.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
@@ -112,6 +108,7 @@
<OpenMPSupport>false</OpenMPSupport>
<EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<LanguageStandard>stdcpp17</LanguageStandard>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
@@ -129,6 +126,7 @@
<EnableEnhancedInstructionSet>
</EnableEnhancedInstructionSet>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<LanguageStandard>stdcpp17</LanguageStandard>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
@@ -149,6 +147,7 @@
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<LanguageStandard>stdcpp17</LanguageStandard>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
@@ -172,6 +171,7 @@
<EnableEnhancedInstructionSet>
</EnableEnhancedInstructionSet>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<LanguageStandard>stdcpp17</LanguageStandard>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>

exe/FaceLandmarkVidMulti/FaceLandmarkVidMulti.cpp

@@ -62,21 +62,55 @@ static void printErrorAndAbort(const std::string & error)
#define FATAL_STREAM( stream ) \
printErrorAndAbort( std::string( "Fatal error: " ) + stream )
using namespace std;
vector<string> get_arguments(int argc, char **argv)
std::vector<std::string> get_arguments(int argc, char **argv)
{
vector<string> arguments;
std::vector<std::string> arguments;
for (int i = 0; i < argc; ++i)
{
arguments.push_back(string(argv[i]));
arguments.push_back(std::string(argv[i]));
}
return arguments;
}
void NonOverlapingDetections(const vector<LandmarkDetector::CLNF>& clnf_models, vector<cv::Rect_<float> >& face_detections)
double IOU(cv::Rect_<float> rect1, cv::Rect_<float> rect2)
{
double intersection_area = (rect1 & rect2).area();
double union_area = rect1.area() + rect2.area() - intersection_area;
return intersection_area / union_area;
}
void RemoveOverlapingModels(std::vector<LandmarkDetector::CLNF>& face_models, std::vector<bool>& active_models)
{
// Go over the model and eliminate detections that are not informative (there already is a tracker there)
for (size_t model1 = 0; model1 < active_models.size(); ++model1)
{
if (active_models[model1])
{
// See if the detections intersect
cv::Rect_<float> model1_rect = face_models[model1].GetBoundingBox();
for (int model2 = model1 + 1; model2 < active_models.size(); ++model2)
{
if(active_models[model2])
{
cv::Rect_<float> model2_rect = face_models[model2].GetBoundingBox();
// If the model is already tracking what we're detecting ignore the detection, this is determined by amount of overlap
if (IOU(model1_rect, model2_rect) > 0.5)
{
active_models[model1] = false;
face_models[model1].Reset();
}
}
}
}
}
}
void NonOverlapingDetections(const std::vector<LandmarkDetector::CLNF>& clnf_models, std::vector<cv::Rect_<float> >& face_detections)
{
// Go over the model and eliminate detections that are not informative (there already is a tracker there)
@@ -88,11 +122,8 @@ void NonOverlapingDetections(const vector<LandmarkDetector::CLNF>& clnf_models,
for (int detection = face_detections.size() - 1; detection >= 0; --detection)
{
double intersection_area = (model_rect & face_detections[detection]).area();
double union_area = model_rect.area() + face_detections[detection].area() - 2 * intersection_area;
// If the model is already tracking what we're detecting ignore the detection, this is determined by amount of overlap
if (intersection_area / union_area > 0.5)
if (IOU(model_rect, face_detections[detection]) > 0.5)
{
face_detections.erase(face_detections.begin() + detection);
}
@@ -103,13 +134,13 @@ void NonOverlapingDetections(const vector<LandmarkDetector::CLNF>& clnf_models,
int main(int argc, char **argv)
{
vector<string> arguments = get_arguments(argc, argv);
std::vector<std::string> arguments = get_arguments(argc, argv);
// no arguments: output usage
if (arguments.size() == 1)
{
cout << "For command line arguments see:" << endl;
cout << " https://github.com/TadasBaltrusaitis/OpenFace/wiki/Command-line-arguments";
std::cout << "For command line arguments see:" << std::endl;
std::cout << " https://github.com/TadasBaltrusaitis/OpenFace/wiki/Command-line-arguments";
return 0;
}
@@ -119,12 +150,12 @@ int main(int argc, char **argv)
det_params.curr_face_detector = LandmarkDetector::FaceModelParameters::MTCNN_DETECTOR;
vector<LandmarkDetector::FaceModelParameters> det_parameters;
std::vector<LandmarkDetector::FaceModelParameters> det_parameters;
det_parameters.push_back(det_params);
// The modules that are being used for tracking
vector<LandmarkDetector::CLNF> face_models;
vector<bool> active_models;
std::vector<LandmarkDetector::CLNF> face_models;
std::vector<bool> active_models;
int num_faces_max = 4;
@@ -132,7 +163,7 @@ int main(int argc, char **argv)
if (!face_model.loaded_successfully)
{
cout << "ERROR: Could not load the landmark detector" << endl;
std::cout << "ERROR: Could not load the landmark detector" << std::endl;
return 1;
}
@@ -145,7 +176,7 @@ int main(int argc, char **argv)
// If can't find MTCNN face detector, default to HOG one
if (det_parameters[0].curr_face_detector == LandmarkDetector::FaceModelParameters::MTCNN_DETECTOR && face_model.face_detector_MTCNN.empty())
{
cout << "INFO: defaulting to HOG-SVM face detector" << endl;
std::cout << "INFO: defaulting to HOG-SVM face detector" << std::endl;
det_parameters[0].curr_face_detector = LandmarkDetector::FaceModelParameters::HOG_SVM_DETECTOR;
}
@@ -168,12 +199,12 @@ int main(int argc, char **argv)
if (!face_model.eye_model)
{
cout << "WARNING: no eye model found" << endl;
std::cout << "WARNING: no eye model found" << std::endl;
}
if (face_analyser.GetAUClassNames().size() == 0 && face_analyser.GetAUClassNames().size() == 0)
{
cout << "WARNING: no Action Unit models found" << endl;
std::cout << "WARNING: no Action Unit models found" << std::endl;
}
// Open a sequence
@@ -232,7 +263,7 @@ int main(int argc, char **argv)
// Reading the images
cv::Mat_<uchar> grayscale_image = sequence_reader.GetGrayFrame();
vector<cv::Rect_<float> > face_detections;
std::vector<cv::Rect_<float> > face_detections;
bool all_models_active = true;
for (unsigned int model = 0; model < face_models.size(); ++model)
@@ -248,7 +279,7 @@ int main(int argc, char **argv)
{
if (det_parameters[0].curr_face_detector == LandmarkDetector::FaceModelParameters::HOG_SVM_DETECTOR)
{
vector<float> confidences;
std::vector<float> confidences;
LandmarkDetector::DetectFacesHOG(face_detections, grayscale_image, face_models[0].face_detector_HOG, confidences);
}
else if (det_parameters[0].curr_face_detector == LandmarkDetector::FaceModelParameters::HAAR_DETECTOR)
@@ -257,7 +288,7 @@ int main(int argc, char **argv)
}
else
{
vector<float> confidences;
std::vector<float> confidences;
LandmarkDetector::DetectFacesMTCNN(face_detections, rgb_image, face_models[0].face_detector_MTCNN, confidences);
}
@@ -314,6 +345,10 @@ int main(int argc, char **argv)
}
}
// Remove models that end up tracking overlapping faces
// even if initial bounding boxes were not overlapping, they could have ended up converging to the same face
RemoveOverlapingModels(face_models, active_models);
// Keeping track of FPS
fps_tracker.AddFrame();
@@ -402,10 +437,10 @@ int main(int argc, char **argv)
// Reporting progress
if (sequence_reader.GetProgress() >= reported_completion / 10.0)
{
cout << reported_completion * 10 << "% ";
std::cout << reported_completion * 10 << "% ";
if (reported_completion == 10)
{
cout << endl;
std::cout << std::endl;
}
reported_completion = reported_completion + 1;
}
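For context on the RemoveOverlapingModels call added above: only the call site appears in this diff, not the function body. A minimal hypothetical sketch of such a pass, using intersection-over-union between the models' tracked bounding boxes (the helper name, the IoU threshold, and the tie-breaking rule are illustrative assumptions, not the repository's actual implementation):

// Hypothetical sketch: deactivate the less certain model of any pair whose
// tracked bounding boxes overlap too strongly (IoU above a threshold).
static void RemoveOverlappingModelsSketch(std::vector<LandmarkDetector::CLNF>& models,
    std::vector<bool>& active, float iou_thresh = 0.5f)
{
    for (size_t i = 0; i < models.size(); ++i)
    {
        if (!active[i]) continue;
        cv::Rect_<float> r1 = models[i].GetBoundingBox();
        for (size_t j = i + 1; j < models.size(); ++j)
        {
            if (!active[j]) continue;
            cv::Rect_<float> r2 = models[j].GetBoundingBox();
            float inter = (r1 & r2).area();
            if (inter <= 0.f) continue;                       // no overlap at all
            float iou = inter / (r1.area() + r2.area() - inter);
            if (iou > iou_thresh)
            {
                // Keep the more certain track; reset the other so it can be re-detected
                size_t drop = (models[i].detection_certainty < models[j].detection_certainty) ? i : j;
                active[drop] = false;
                models[drop].Reset();
            }
        }
    }
}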

View File

@@ -22,32 +22,32 @@
<ProjectGuid>{C3FAF36F-44BC-4454-87C2-C5106575FE50}</ProjectGuid>
<RootNamespace>FaceLandmarkVidMulti</RootNamespace>
<ProjectName>FaceLandmarkVidMulti</ProjectName>
<WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion>
<WindowsTargetPlatformVersion>10.0.17134.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
@@ -57,29 +57,25 @@
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
<Import Project="..\..\lib\3rdParty\dlib\dlib.props" />
<Import Project="..\..\lib\3rdParty\boost\boost_d.props" />
<Import Project="..\..\lib\3rdParty\OpenCV3.4\openCV3.4.props" />
<Import Project="..\..\lib\3rdParty\OpenCV\openCV.props" />
<Import Project="..\..\lib\3rdParty\OpenBLAS\OpenBLAS_x86.props" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
<Import Project="..\..\lib\3rdParty\dlib\dlib.props" />
<Import Project="..\..\lib\3rdParty\boost\boost_d.props" />
<Import Project="..\..\lib\3rdParty\OpenCV3.4\openCV3.4.props" />
<Import Project="..\..\lib\3rdParty\OpenCV\openCV.props" />
<Import Project="..\..\lib\3rdParty\OpenBLAS\OpenBLAS_64.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
<Import Project="..\..\lib\3rdParty\boost\boost.props" />
<Import Project="..\..\lib\3rdParty\dlib\dlib.props" />
<Import Project="..\..\lib\3rdParty\OpenCV3.4\openCV3.4.props" />
<Import Project="..\..\lib\3rdParty\OpenCV\openCV.props" />
<Import Project="..\..\lib\3rdParty\OpenBLAS\OpenBLAS_x86.props" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
<Import Project="..\..\lib\3rdParty\boost\boost.props" />
<Import Project="..\..\lib\3rdParty\dlib\dlib.props" />
<Import Project="..\..\lib\3rdParty\OpenCV3.4\openCV3.4.props" />
<Import Project="..\..\lib\3rdParty\OpenCV\openCV.props" />
<Import Project="..\..\lib\3rdParty\OpenBLAS\OpenBLAS_64.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
@@ -104,6 +100,7 @@
<AdditionalIncludeDirectories>$(SolutionDir)\lib\local\LandmarkDetector\include;$(SolutionDir)\lib\local\Utilities\include;$(SolutionDir)\lib\local\FaceAnalyser\include;$(SolutionDir)\lib\local\GazeAnalyser\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<LanguageStandard>stdcpp17</LanguageStandard>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
@@ -117,6 +114,7 @@
<EnableEnhancedInstructionSet>
</EnableEnhancedInstructionSet>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<LanguageStandard>stdcpp17</LanguageStandard>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
@@ -134,6 +132,7 @@
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<LanguageStandard>stdcpp17</LanguageStandard>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
@@ -155,6 +154,7 @@
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<PreprocessorDefinitions>WIN64;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<LanguageStandard>stdcpp17</LanguageStandard>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>

View File

@@ -68,17 +68,15 @@ static void printErrorAndAbort(const std::string & error)
#define FATAL_STREAM( stream ) \
printErrorAndAbort( std::string( "Fatal error: " ) + stream )
using namespace std;
vector<string> get_arguments(int argc, char **argv)
std::vector<std::string> get_arguments(int argc, char **argv)
{
vector<string> arguments;
std::vector<std::string> arguments;
// First argument is reserved for the name of the executable
for (int i = 0; i < argc; ++i)
{
arguments.push_back(string(argv[i]));
arguments.push_back(std::string(argv[i]));
}
return arguments;
}
@@ -86,13 +84,13 @@ vector<string> get_arguments(int argc, char **argv)
int main(int argc, char **argv)
{
vector<string> arguments = get_arguments(argc, argv);
std::vector<std::string> arguments = get_arguments(argc, argv);
// no arguments: output usage
if (arguments.size() == 1)
{
cout << "For command line arguments see:" << endl;
cout << " https://github.com/TadasBaltrusaitis/OpenFace/wiki/Command-line-arguments";
std::cout << "For command line arguments see:" << std::endl;
std::cout << " https://github.com/TadasBaltrusaitis/OpenFace/wiki/Command-line-arguments";
return 0;
}
@@ -104,7 +102,7 @@ int main(int argc, char **argv)
if (!face_model.loaded_successfully)
{
cout << "ERROR: Could not load the landmark detector" << endl;
std::cout << "ERROR: Could not load the landmark detector" << std::endl;
return 1;
}
@@ -114,12 +112,12 @@ int main(int argc, char **argv)
if (!face_model.eye_model)
{
cout << "WARNING: no eye model found" << endl;
std::cout << "WARNING: no eye model found" << std::endl;
}
if (face_analyser.GetAUClassNames().size() == 0 && face_analyser.GetAURegNames().size() == 0)
{
cout << "WARNING: no Action Unit models found" << endl;
std::cout << "WARNING: no Action Unit models found" << std::endl;
}
Utilities::SequenceCapture sequence_reader;
@@ -158,7 +156,7 @@ int main(int argc, char **argv)
Utilities::RecorderOpenFace open_face_rec(sequence_reader.name, recording_params, arguments);
if (recording_params.outputGaze() && !face_model.eye_model)
cout << "WARNING: no eye model defined, but outputting gaze" << endl;
std::cout << "WARNING: no eye model defined, but outputting gaze" << std::endl;
captured_image = sequence_reader.GetNextFrame();
@@ -240,10 +238,10 @@ int main(int argc, char **argv)
// Reporting progress
if (sequence_reader.GetProgress() >= reported_completion / 10.0)
{
cout << reported_completion * 10 << "% ";
std::cout << reported_completion * 10 << "% ";
if (reported_completion == 10)
{
cout << endl;
std::cout << std::endl;
}
reported_completion = reported_completion + 1;
}

View File

@@ -22,34 +22,34 @@
<ProjectGuid>{8A23C00D-767D-422D-89A3-CF225E3DAB4B}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>FeatureExtraction</RootNamespace>
<WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion>
<WindowsTargetPlatformVersion>10.0.17134.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>Unicode</CharacterSet>
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>Unicode</CharacterSet>
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
@@ -57,29 +57,25 @@
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
<Import Project="..\..\lib\3rdParty\dlib\dlib.props" />
<Import Project="..\..\lib\3rdParty\boost\boost_d.props" />
<Import Project="..\..\lib\3rdParty\OpenCV3.4\openCV3.4.props" />
<Import Project="..\..\lib\3rdParty\OpenCV\openCV.props" />
<Import Project="..\..\lib\3rdParty\OpenBLAS\OpenBLAS_x86.props" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
<Import Project="..\..\lib\3rdParty\dlib\dlib.props" />
<Import Project="..\..\lib\3rdParty\boost\boost_d.props" />
<Import Project="..\..\lib\3rdParty\OpenCV3.4\openCV3.4.props" />
<Import Project="..\..\lib\3rdParty\OpenCV\openCV.props" />
<Import Project="..\..\lib\3rdParty\OpenBLAS\OpenBLAS_64.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
<Import Project="..\..\lib\3rdParty\boost\boost.props" />
<Import Project="..\..\lib\3rdParty\dlib\dlib.props" />
<Import Project="..\..\lib\3rdParty\OpenCV3.4\openCV3.4.props" />
<Import Project="..\..\lib\3rdParty\OpenCV\openCV.props" />
<Import Project="..\..\lib\3rdParty\OpenBLAS\OpenBLAS_x86.props" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
<Import Project="..\..\lib\3rdParty\boost\boost.props" />
<Import Project="..\..\lib\3rdParty\dlib\dlib.props" />
<Import Project="..\..\lib\3rdParty\OpenCV3.4\openCV3.4.props" />
<Import Project="..\..\lib\3rdParty\OpenCV\openCV.props" />
<Import Project="..\..\lib\3rdParty\OpenBLAS\OpenBLAS_64.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
@@ -111,6 +107,7 @@
<OpenMPSupport>false</OpenMPSupport>
<EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<LanguageStandard>stdcpp17</LanguageStandard>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
@@ -128,6 +125,7 @@
<EnableEnhancedInstructionSet>
</EnableEnhancedInstructionSet>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<LanguageStandard>stdcpp17</LanguageStandard>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
@@ -149,6 +147,7 @@
<EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<LanguageStandard>stdcpp17</LanguageStandard>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
@@ -173,6 +172,7 @@
</EnableEnhancedInstructionSet>
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<LanguageStandard>stdcpp17</LanguageStandard>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>

View File

@@ -43,14 +43,11 @@
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/videoio/videoio.hpp> // Video write
#include <opencv2/videoio/videoio_c.h> // Video write
#include <stdio.h>
#include <time.h>
#include <filesystem.hpp>
#include <filesystem/fstream.hpp>
#include <filesystem>
#define INFO_STREAM( stream ) \
std::cout << stream << std::endl
@@ -70,8 +67,6 @@ static void printErrorAndAbort( const std::string & error )
#define FATAL_STREAM( stream ) \
printErrorAndAbort( std::string( "Fatal error: " ) + stream )
using namespace std;
// Get current date/time, format is YYYY-MM-DD.HH:mm:ss
const std::string currentDateTime() {
time_t now = time(0);
@@ -85,14 +80,14 @@ const std::string currentDateTime() {
return buf;
}
vector<string> get_arguments(int argc, char **argv)
std::vector<std::string> get_arguments(int argc, char **argv)
{
vector<string> arguments;
std::vector<std::string> arguments;
for(int i = 1; i < argc; ++i)
{
arguments.push_back(string(argv[i]));
arguments.push_back(std::string(argv[i]));
}
return arguments;
}
@@ -100,10 +95,10 @@ vector<string> get_arguments(int argc, char **argv)
int main (int argc, char **argv)
{
vector<string> arguments = get_arguments(argc, argv);
std::vector<std::string> arguments = get_arguments(argc, argv);
// Some initial parameters that can be overriden from command line
string outroot, outfile;
std::string outroot, outfile;
TCHAR NPath[200];
GetCurrentDirectory(200, NPath);
@@ -152,16 +147,16 @@ int main (int argc, char **argv)
cv::Mat img;
vCap >> img;
boost::filesystem::path dir(outroot);
boost::filesystem::create_directory(dir);
std::filesystem::path dir(outroot);
std::filesystem::create_directory(dir);
string out_file = outroot + outfile;
std::string out_file = outroot + outfile;
// saving the videos
cv::VideoWriter video_writer(out_file, CV_FOURCC('D','I','V','X'), 30, img.size(), true);
cv::VideoWriter video_writer(out_file, cv::VideoWriter::fourcc('D','I','V','X'), 30, img.size(), true);
ofstream outlog;
outlog.open((outroot + outfile + ".log").c_str(), ios_base::out);
outlog << "frame, time(ms)" << endl;
std::ofstream outlog;
outlog.open((outroot + outfile + ".log").c_str(), std::ios_base::out);
outlog << "frame, time(ms)" << std::endl;
double freq = cv::getTickFrequency();
@@ -180,7 +175,7 @@ int main (int argc, char **argv)
video_writer << img;
outlog << frameProc + 1 << " " << curr_time;
outlog << endl;
outlog << std::endl;
cv::imshow("rec", img);
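The two API moves in the hunks above, in one self-contained sketch: boost::filesystem replaced by the C++17 std::filesystem, and the CV_FOURCC macro (gone along with videoio_c.h) replaced by cv::VideoWriter::fourcc in OpenCV 4. The output path and frame size are illustrative:

// Minimal sketch of the migrated APIs; builds against OpenCV 4 with a C++17 compiler.
#include <filesystem>
#include <opencv2/videoio.hpp>

int main()
{
    std::filesystem::create_directory("recordings");              // was boost::filesystem::create_directory
    cv::VideoWriter writer("recordings/out.avi",
                           cv::VideoWriter::fourcc('D', 'I', 'V', 'X'), // was CV_FOURCC('D','I','V','X')
                           30.0, cv::Size(640, 480), true);
    return writer.isOpened() ? 0 : 1;
}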

View File

@@ -21,57 +21,53 @@
<PropertyGroup Label="Globals">
<ProjectGuid>{2D80FA0B-2DE8-4475-BA5A-C08A9E1EDAAC}</ProjectGuid>
<RootNamespace>Recording</RootNamespace>
<WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion>
<WindowsTargetPlatformVersion>10.0.17134.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
<Import Project="..\..\lib\3rdParty\boost\boost_d.props" />
<Import Project="..\..\lib\3rdParty\OpenCV3.4\openCV3.4.props" />
<Import Project="..\..\lib\3rdParty\OpenCV\openCV.props" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
<Import Project="..\..\lib\3rdParty\boost\boost_d.props" />
<Import Project="..\..\lib\3rdParty\OpenCV3.4\openCV3.4.props" />
<Import Project="..\..\lib\3rdParty\OpenCV\openCV.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
<Import Project="..\..\lib\3rdParty\boost\boost.props" />
<Import Project="..\..\lib\3rdParty\OpenCV3.4\openCV3.4.props" />
<Import Project="..\..\lib\3rdParty\OpenCV\openCV.props" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
<Import Project="..\..\lib\3rdParty\boost\boost.props" />
<Import Project="..\..\lib\3rdParty\OpenCV3.4\openCV3.4.props" />
<Import Project="..\..\lib\3rdParty\OpenCV\openCV.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
@@ -86,6 +82,7 @@
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<LanguageStandard>stdcpp17</LanguageStandard>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
@@ -96,6 +93,7 @@
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<LanguageStandard>stdcpp17</LanguageStandard>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
@@ -110,6 +108,7 @@
<OpenMPSupport>false</OpenMPSupport>
<CallingConvention>Cdecl</CallingConvention>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<LanguageStandard>stdcpp17</LanguageStandard>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
@@ -128,6 +127,7 @@
<OpenMPSupport>false</OpenMPSupport>
<CallingConvention>Cdecl</CallingConvention>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<LanguageStandard>stdcpp17</LanguageStandard>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>

View File

@@ -1,5 +1,5 @@
clear;
version = '2.1.0';
version = '2.2.0';
out_x86 = sprintf('OpenFace_%s_win_x86', version);
out_x64 = sprintf('OpenFace_%s_win_x64', version);

View File

@@ -68,6 +68,7 @@
</Reference>
<Reference Include="System" />
<Reference Include="System.Data" />
<Reference Include="System.Drawing" />
<Reference Include="System.Windows.Forms" />
<Reference Include="System.Xml" />
<Reference Include="Microsoft.CSharp" />

View File

@@ -1,16 +1,13 @@
#!/bin/bash
#==============================================================================
# Title: install.sh
# Description: Install everything necessary for OpenFace to compile.
# Author: Daniyal Shahrokhian <daniyal@kth.se>
# Date: 20170428
# Version : 1.02
# Description: Install everything necessary for OpenFace to compile.
# Installs all required dependencies; only use this if you do not have the dependencies
# already installed, or if you don't mind specific versions of gcc, g++, cmake, OpenCV, etc. being installed
# Author: Daniyal Shahrokhian <daniyal@kth.se>, Tadas Baltrusaitis <tadyla@gmail.com>
# Date: 20190630
# Version : 1.03
# Usage: bash install.sh
# NOTES: There are certain steps to be taken in the system before installing
# via this script (refer to README): Run
# `sudo gedit /etc/apt/sources.list` and change the line
# `deb http://us.archive.ubuntu.com/ubuntu/ xenial main restricted` to
# `deb http://us.archive.ubuntu.com/ubuntu/ xenial main universe`
#==============================================================================
# Exit script if any command fails
@@ -25,19 +22,49 @@ fi
# Essential Dependencies
echo "Installing Essential dependencies..."
# If we're not on Ubuntu 18.04, add the toolchain PPA so that gcc-8 is available
sudo apt-get -y update
if [[ `lsb_release -rs` != "18.04" ]]
then
echo "Adding ppa:ubuntu-toolchain-r/test apt-repository "
sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y
sudo apt-get -y update
fi
sudo apt-get -y install build-essential
sudo apt-get -y install cmake
sudo apt-get -y install gcc-8 g++-8
# Ubuntu 16.04 does not ship a new enough CMake, so we need to build it manually
if [[ `lsb_release -rs` != "18.04" ]]; then
sudo apt-get --purge remove cmake-qt-gui -y
sudo apt-get --purge remove cmake -y
mkdir -p cmake_tmp
cd cmake_tmp
wget https://cmake.org/files/v3.10/cmake-3.10.1.tar.gz
tar -xzvf cmake-3.10.1.tar.gz
cd cmake-3.10.1/
./bootstrap
make -j4
sudo make install
cd ../..
sudo rm -r cmake_tmp
else
sudo apt-get -y install cmake
fi
sudo apt-get -y install zip
sudo apt-get -y install libopenblas-dev liblapack-dev
sudo apt-get -y install git libgtk2.0-dev pkg-config libavcodec-dev libavformat-dev libswscale-dev
sudo apt-get -y install python-dev python-numpy libtbb2 libtbb-dev libjpeg-dev libpng-dev libtiff-dev libdc1394-22-dev
sudo apt-get -y install libgtk2.0-dev pkg-config libavcodec-dev libavformat-dev
sudo apt-get -y install libtbb2 libjpeg-dev libpng-dev libtiff-dev libdc1394-22-dev
echo "Essential dependencies installed."
# OpenCV Dependency
echo "Downloading OpenCV..."
wget https://github.com/opencv/opencv/archive/3.4.0.zip
unzip 3.4.0.zip
cd opencv-3.4.0
wget https://github.com/opencv/opencv/archive/4.1.0.zip
unzip 4.1.0.zip
cd opencv-4.1.0
mkdir -p build
cd build
echo "Installing OpenCV..."
@@ -45,8 +72,8 @@ cmake -D CMAKE_BUILD_TYPE=RELEASE -D CMAKE_INSTALL_PREFIX=/usr/local -D WITH_TBB
make -j4
sudo make install
cd ../..
rm 3.4.0.zip
sudo rm -r opencv-3.4.0
rm 4.1.0.zip
sudo rm -r opencv-4.1.0
echo "OpenCV installed."
# dlib dependency
@@ -65,16 +92,11 @@ cd ../..;
rm -r dlib-19.13.tar.bz2
echo "dlib installed"
# Boost C++ Dependency
echo "Installing Boost..."
sudo apt-get install libboost-all-dev
echo "Boost installed."
# OpenFace installation
echo "Installing OpenFace..."
mkdir -p build
cd build
cmake -D CMAKE_BUILD_TYPE=RELEASE ..
cmake -D CMAKE_CXX_COMPILER=g++-8 -D CMAKE_C_COMPILER=gcc-8 -D CMAKE_BUILD_TYPE=RELEASE ..
make
cd ..
echo "OpenFace successfully installed."

View File

@@ -22,32 +22,32 @@
<ProjectGuid>{50B7D4BF-E33B-41D0-AA89-76BBA57BF5CC}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>CameraEnumerator</RootNamespace>
<WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion>
<WindowsTargetPlatformVersion>10.0.17134.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
@@ -78,6 +78,7 @@
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
<LanguageStandard>stdcpp17</LanguageStandard>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
@@ -92,6 +93,7 @@
<PreprocessorDefinitions>_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<EnableEnhancedInstructionSet>
</EnableEnhancedInstructionSet>
<LanguageStandard>stdcpp17</LanguageStandard>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
@@ -108,6 +110,7 @@
<PreprocessorDefinitions>WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
<LanguageStandard>stdcpp17</LanguageStandard>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
@@ -127,6 +130,7 @@
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<EnableEnhancedInstructionSet>
</EnableEnhancedInstructionSet>
<LanguageStandard>stdcpp17</LanguageStandard>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>

View File

@@ -0,0 +1,150 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_CALIB3D_C_H
#define OPENCV_CALIB3D_C_H
#include "opencv2/core/types_c.h"
#ifdef __cplusplus
extern "C" {
#endif
/* Calculates fundamental matrix given a set of corresponding points */
#define CV_FM_7POINT 1
#define CV_FM_8POINT 2
#define CV_LMEDS 4
#define CV_RANSAC 8
#define CV_FM_LMEDS_ONLY CV_LMEDS
#define CV_FM_RANSAC_ONLY CV_RANSAC
#define CV_FM_LMEDS CV_LMEDS
#define CV_FM_RANSAC CV_RANSAC
enum
{
CV_ITERATIVE = 0,
CV_EPNP = 1, // F.Moreno-Noguer, V.Lepetit and P.Fua "EPnP: Efficient Perspective-n-Point Camera Pose Estimation"
CV_P3P = 2, // X.S. Gao, X.-R. Hou, J. Tang, H.-F. Chang; "Complete Solution Classification for the Perspective-Three-Point Problem"
CV_DLS = 3 // Joel A. Hesch and Stergios I. Roumeliotis. "A Direct Least-Squares (DLS) Method for PnP"
};
#define CV_CALIB_CB_ADAPTIVE_THRESH 1
#define CV_CALIB_CB_NORMALIZE_IMAGE 2
#define CV_CALIB_CB_FILTER_QUADS 4
#define CV_CALIB_CB_FAST_CHECK 8
#define CV_CALIB_USE_INTRINSIC_GUESS 1
#define CV_CALIB_FIX_ASPECT_RATIO 2
#define CV_CALIB_FIX_PRINCIPAL_POINT 4
#define CV_CALIB_ZERO_TANGENT_DIST 8
#define CV_CALIB_FIX_FOCAL_LENGTH 16
#define CV_CALIB_FIX_K1 32
#define CV_CALIB_FIX_K2 64
#define CV_CALIB_FIX_K3 128
#define CV_CALIB_FIX_K4 2048
#define CV_CALIB_FIX_K5 4096
#define CV_CALIB_FIX_K6 8192
#define CV_CALIB_RATIONAL_MODEL 16384
#define CV_CALIB_THIN_PRISM_MODEL 32768
#define CV_CALIB_FIX_S1_S2_S3_S4 65536
#define CV_CALIB_TILTED_MODEL 262144
#define CV_CALIB_FIX_TAUX_TAUY 524288
#define CV_CALIB_FIX_TANGENT_DIST 2097152
#define CV_CALIB_NINTRINSIC 18
#define CV_CALIB_FIX_INTRINSIC 256
#define CV_CALIB_SAME_FOCAL_LENGTH 512
#define CV_CALIB_ZERO_DISPARITY 1024
/* stereo correspondence parameters and functions */
#define CV_STEREO_BM_NORMALIZED_RESPONSE 0
#define CV_STEREO_BM_XSOBEL 1
#ifdef __cplusplus
} // extern "C"
//////////////////////////////////////////////////////////////////////////////////////////
class CV_EXPORTS CvLevMarq
{
public:
CvLevMarq();
CvLevMarq( int nparams, int nerrs, CvTermCriteria criteria=
cvTermCriteria(CV_TERMCRIT_EPS+CV_TERMCRIT_ITER,30,DBL_EPSILON),
bool completeSymmFlag=false );
~CvLevMarq();
void init( int nparams, int nerrs, CvTermCriteria criteria=
cvTermCriteria(CV_TERMCRIT_EPS+CV_TERMCRIT_ITER,30,DBL_EPSILON),
bool completeSymmFlag=false );
bool update( const CvMat*& param, CvMat*& J, CvMat*& err );
bool updateAlt( const CvMat*& param, CvMat*& JtJ, CvMat*& JtErr, double*& errNorm );
void clear();
void step();
enum { DONE=0, STARTED=1, CALC_J=2, CHECK_ERR=3 };
cv::Ptr<CvMat> mask;
cv::Ptr<CvMat> prevParam;
cv::Ptr<CvMat> param;
cv::Ptr<CvMat> J;
cv::Ptr<CvMat> err;
cv::Ptr<CvMat> JtJ;
cv::Ptr<CvMat> JtJN;
cv::Ptr<CvMat> JtErr;
cv::Ptr<CvMat> JtJV;
cv::Ptr<CvMat> JtJW;
double prevErrNorm, errNorm;
int lambdaLg10;
CvTermCriteria criteria;
int state;
int iters;
bool completeSymmFlag;
int solveMethod;
};
#endif
#endif /* OPENCV_CALIB3D_C_H */

View File

@@ -75,6 +75,7 @@
@defgroup core_utils_sse SSE utilities
@defgroup core_utils_neon NEON utilities
@defgroup core_utils_softfloat Softfloat support
@defgroup core_utils_samples Utility functions for OpenCV samples
@}
@defgroup core_opengl OpenGL interoperability
@defgroup core_ipp Intel IPP Asynchronous C/C++ Converters
@@ -91,6 +92,7 @@
@{
@defgroup core_hal_intrin_impl Private implementation helpers
@}
@defgroup core_lowlevel_api Low-level API for external libraries / plugins
@}
@}
*/
@@ -124,7 +126,7 @@ public:
/*!
\return the error description and the context as a text string.
*/
virtual const char *what() const throw();
virtual const char *what() const throw() CV_OVERRIDE;
void formatMessage();
String msg; ///< the formatted error message
@@ -140,11 +142,11 @@ public:
By default the function prints information about the error to stderr,
then it either stops if cv::setBreakOnError() had been called before or raises the exception.
It is possible to alternate error processing by using cv::redirectError().
It is possible to alternate error processing by using #redirectError().
@param exc the exception raised.
@deprecated drop this version
*/
CV_EXPORTS void error( const Exception& exc );
CV_EXPORTS CV_NORETURN void error(const Exception& exc);
enum SortFlags { SORT_EVERY_ROW = 0, //!< each matrix row is sorted independently
SORT_EVERY_COLUMN = 1, //!< each matrix column is sorted
@@ -175,7 +177,7 @@ enum CovarFlags {
/**The output covariance matrix is calculated as:
\f[\texttt{scale} \cdot [ \texttt{vects} [0]- \texttt{mean} , \texttt{vects} [1]- \texttt{mean} ,...] \cdot [ \texttt{vects} [0]- \texttt{mean} , \texttt{vects} [1]- \texttt{mean} ,...]^T,\f]
covar will be a square matrix of the same size as the total number of elements in each input
vector. One and only one of COVAR_SCRAMBLED and COVAR_NORMAL must be specified.*/
vector. One and only one of #COVAR_SCRAMBLED and #COVAR_NORMAL must be specified.*/
COVAR_NORMAL = 1,
/** If the flag is specified, the function does not calculate mean from
the input vectors but, instead, uses the passed mean vector. This is useful if mean has been
@@ -211,28 +213,6 @@ enum KmeansFlags {
KMEANS_USE_INITIAL_LABELS = 1
};
//! type of line
enum LineTypes {
FILLED = -1,
LINE_4 = 4, //!< 4-connected line
LINE_8 = 8, //!< 8-connected line
LINE_AA = 16 //!< antialiased line
};
//! Only a subset of Hershey fonts
//! <http://sources.isc.org/utils/misc/hershey-font.txt> are supported
enum HersheyFonts {
FONT_HERSHEY_SIMPLEX = 0, //!< normal size sans-serif font
FONT_HERSHEY_PLAIN = 1, //!< small size sans-serif font
FONT_HERSHEY_DUPLEX = 2, //!< normal size sans-serif font (more complex than FONT_HERSHEY_SIMPLEX)
FONT_HERSHEY_COMPLEX = 3, //!< normal size serif font
FONT_HERSHEY_TRIPLEX = 4, //!< normal size serif font (more complex than FONT_HERSHEY_COMPLEX)
FONT_HERSHEY_COMPLEX_SMALL = 5, //!< smaller version of FONT_HERSHEY_COMPLEX
FONT_HERSHEY_SCRIPT_SIMPLEX = 6, //!< hand-writing style font
FONT_HERSHEY_SCRIPT_COMPLEX = 7, //!< more complex variant of FONT_HERSHEY_SCRIPT_SIMPLEX
FONT_ITALIC = 16 //!< flag for italic font
};
enum ReduceTypes { REDUCE_SUM = 0, //!< the output is the sum of all rows/columns of the matrix.
REDUCE_AVG = 1, //!< the output is the mean vector of all rows/columns of the matrix.
REDUCE_MAX = 2, //!< the output is the maximum (column/row-wise) of all rows/columns of the matrix.
@@ -266,17 +246,19 @@ Normally, the function is not called directly. It is used inside filtering funct
copyMakeBorder.
@param p 0-based coordinate of the extrapolated pixel along one of the axes, likely \<0 or \>= len
@param len Length of the array along the corresponding axis.
@param borderType Border type, one of the cv::BorderTypes, except for cv::BORDER_TRANSPARENT and
cv::BORDER_ISOLATED . When borderType==cv::BORDER_CONSTANT , the function always returns -1, regardless
@param borderType Border type, one of the #BorderTypes, except for #BORDER_TRANSPARENT and
#BORDER_ISOLATED . When borderType==#BORDER_CONSTANT , the function always returns -1, regardless
of p and len.
@sa copyMakeBorder
*/
CV_EXPORTS_W int borderInterpolate(int p, int len, int borderType);
/** @example copyMakeBorder_demo.cpp
An example using copyMakeBorder function
*/
/** @example samples/cpp/tutorial_code/ImgTrans/copyMakeBorder_demo.cpp
An example using copyMakeBorder function.
Check @ref tutorial_copyMakeBorder "the corresponding tutorial" for more details
*/
/** @brief Forms a border around an image.
The function copies the source image into the middle of the destination image. The areas to the
@@ -304,7 +286,7 @@ function does not copy src itself but simply constructs the border, for example:
@endcode
@note When the source image is a part (ROI) of a bigger image, the function will try to use the
pixels outside of the ROI to form a border. To disable this feature and always do extrapolation, as
if src was not a ROI, use borderType | BORDER_ISOLATED.
if src was not a ROI, use borderType | #BORDER_ISOLATED.
@param src Source image.
@param dst Destination image of the same type as src and the size Size(src.cols+left+right,
@@ -435,8 +417,13 @@ The function cv::divide divides one array by another:
or a scalar by an array when there is no src1 :
\f[\texttt{dst(I) = saturate(scale/src2(I))}\f]
When src2(I) is zero, dst(I) will also be zero. Different channels of
multi-channel arrays are processed independently.
Different channels of multi-channel arrays are processed independently.
For integer types when src2(I) is zero, dst(I) will also be zero.
@note In case of floating point data there is no special defined behavior for zero src2(I) values.
Regular floating-point division is used.
Expect correct IEEE-754 behaviour for floating-point data (with NaN, Inf result values).
@note Saturation is not applied when the output array has the depth CV_32S. You may even get
result of an incorrect sign in the case of overflow.
@@ -475,9 +462,10 @@ The function can also be emulated with a matrix expression, for example:
*/
CV_EXPORTS_W void scaleAdd(InputArray src1, double alpha, InputArray src2, OutputArray dst);
/** @example AddingImagesTrackbar.cpp
/** @example samples/cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp
Check @ref tutorial_trackbar "the corresponding tutorial" for more details
*/
*/
/** @brief Calculates the weighted sum of two arrays.
The function addWeighted calculates the weighted sum of two arrays as follows:
@@ -533,8 +521,9 @@ CV_EXPORTS_W void convertScaleAbs(InputArray src, OutputArray dst,
/** @brief Converts an array to half precision floating number.
This function converts FP32 (single precision floating point) from/to FP16 (half precision floating point). The input array has to have type of CV_32F or
CV_16S to represent the bit depth. If the input array is neither of them, the function will raise an error.
This function converts FP32 (single precision floating point) from/to FP16 (half precision floating point). CV_16S format is used to represent FP16 data.
There are two use modes (src -> dst): CV_32F -> CV_16S and CV_16S -> CV_32F. The input array has to have type of CV_32F or
CV_16S to represent the bit depth. If the input array is neither of them, the function will raise an error.
The format of half precision floating point is defined in IEEE 754-2008.
@param src input array.
@@ -599,7 +588,7 @@ or
// access pixel coordinates
Point pnt = locations[i];
@endcode
@param src single-channel array (type CV_8UC1)
@param src single-channel array
@param idx the output array, type of cv::Mat or std::vector<Point>, corresponding to non-zero indices in the input
*/
CV_EXPORTS_W void findNonZero( InputArray src, OutputArray idx );
@@ -642,7 +631,7 @@ CV_EXPORTS_W void meanStdDev(InputArray src, OutputArray mean, OutputArray stdde
/** @brief Calculates the absolute norm of an array.
This version of cv::norm calculates the absolute norm of src1. The type of norm to calculate is specified using cv::NormTypes.
This version of #norm calculates the absolute norm of src1. The type of norm to calculate is specified using #NormTypes.
As example for one array consider the function \f$r(x)= \begin{pmatrix} x \\ 1-x \end{pmatrix}, x \in [-1;1]\f$.
The \f$ L_{1}, L_{2} \f$ and \f$ L_{\infty} \f$ norm for the sample value \f$r(-1) = \begin{pmatrix} -1 \\ 2 \end{pmatrix}\f$
@@ -664,7 +653,7 @@ It is notable that the \f$ L_{1} \f$ norm forms the upper and the \f$ L_{\infty}
When the mask parameter is specified and it is not empty, the norm is
If normType is not specified, NORM_L2 is used.
If normType is not specified, #NORM_L2 is used.
calculated only over the region specified by the mask.
Multi-channel input arrays are treated as single-channel arrays, that is,
@@ -673,7 +662,7 @@ the results for all channels are combined.
Hamming norms can only be calculated with CV_8U depth arrays.
@param src1 first input array.
@param normType type of the norm (see cv::NormTypes).
@param normType type of the norm (see #NormTypes).
@param mask optional operation mask; it must have the same size as src1 and CV_8UC1 type.
*/
CV_EXPORTS_W double norm(InputArray src1, int normType = NORM_L2, InputArray mask = noArray());
@@ -682,24 +671,25 @@ CV_EXPORTS_W double norm(InputArray src1, int normType = NORM_L2, InputArray mas
This version of cv::norm calculates the absolute difference norm
or the relative difference norm of arrays src1 and src2.
The type of norm to calculate is specified using cv::NormTypes.
The type of norm to calculate is specified using #NormTypes.
@param src1 first input array.
@param src2 second input array of the same size and the same type as src1.
@param normType type of the norm (cv::NormTypes).
@param normType type of the norm (see #NormTypes).
@param mask optional operation mask; it must have the same size as src1 and CV_8UC1 type.
*/
CV_EXPORTS_W double norm(InputArray src1, InputArray src2,
int normType = NORM_L2, InputArray mask = noArray());
/** @overload
@param src first input array.
@param normType type of the norm (see cv::NormTypes).
@param normType type of the norm (see #NormTypes).
*/
CV_EXPORTS double norm( const SparseMat& src, int normType );
/** @brief Computes the Peak Signal-to-Noise Ratio (PSNR) image quality metric.
This function calculates the Peak Signal-to-Noise Ratio (PSNR) image quality metric in decibels (dB), between two input arrays src1 and src2. Arrays must have depth CV_8U.
This function calculates the Peak Signal-to-Noise Ratio (PSNR) image quality metric in decibels (dB),
between two input arrays src1 and src2. The arrays must have the same type.
The PSNR is calculated as follows:
@@ -707,13 +697,15 @@ The PSNR is calculated as follows:
\texttt{PSNR} = 10 \cdot \log_{10}{\left( \frac{R^2}{MSE} \right) }
\f]
where R is the maximum integer value of depth CV_8U (255) and MSE is the mean squared error between the two arrays.
where R is the maximum integer value of depth (e.g. 255 in the case of CV_8U data)
and MSE is the mean squared error between the two arrays.
@param src1 first input array.
@param src2 second input array of the same size as src1.
@param R the maximum pixel value (255 by default)
*/
CV_EXPORTS_W double PSNR(InputArray src1, InputArray src2);
CV_EXPORTS_W double PSNR(InputArray src1, InputArray src2, double R=255.);
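A short usage sketch of the widened signature above (the cv::Mat variables are hypothetical; both inputs must have the same type): with 8-bit images the default R of 255 applies, while higher bit depths now pass their peak value explicitly:

double psnr_8u  = cv::PSNR(img_a, img_b);              // CV_8U data, R defaults to 255.0
double psnr_16u = cv::PSNR(img16_a, img16_b, 65535.0); // CV_16U data, explicit peak value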
/** @brief naive nearest neighbor finder
@@ -859,11 +851,11 @@ CV_EXPORTS void minMaxLoc(const SparseMat& a, double* minVal,
/** @brief Reduces a matrix to a vector.
The function cv::reduce reduces the matrix to a vector by treating the matrix rows/columns as a set of
The function #reduce reduces the matrix to a vector by treating the matrix rows/columns as a set of
1D vectors and performing the specified operation on the vectors until a single row/column is
obtained. For example, the function can be used to compute horizontal and vertical projections of a
raster image. In case of REDUCE_MAX and REDUCE_MIN , the output image should have the same type as the source one.
In case of REDUCE_SUM and REDUCE_AVG , the output may have a larger element bit-depth to preserve accuracy.
raster image. In case of #REDUCE_MAX and #REDUCE_MIN , the output image should have the same type as the source one.
In case of #REDUCE_SUM and #REDUCE_AVG , the output may have a larger element bit-depth to preserve accuracy.
And multi-channel arrays are also supported in these two reduction modes.
The following code demonstrates its usage for a single channel matrix.
@@ -876,7 +868,7 @@ And the following code demonstrates its usage for a two-channel matrix.
@param dst output vector. Its size and type is defined by dim and dtype parameters.
@param dim dimension index along which the matrix is reduced. 0 means that the matrix is reduced to
a single row. 1 means that the matrix is reduced to a single column.
@param rtype reduction operation that could be one of cv::ReduceTypes
@param rtype reduction operation that could be one of #ReduceTypes
@param dtype when negative, the output vector will have the same type as the input matrix,
otherwise, its type will be CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()).
@sa repeat
@@ -1063,19 +1055,19 @@ around both axes.
CV_EXPORTS_W void flip(InputArray src, OutputArray dst, int flipCode);
enum RotateFlags {
ROTATE_90_CLOCKWISE = 0, //Rotate 90 degrees clockwise
ROTATE_180 = 1, //Rotate 180 degrees clockwise
ROTATE_90_COUNTERCLOCKWISE = 2, //Rotate 270 degrees clockwise
ROTATE_90_CLOCKWISE = 0, //!<Rotate 90 degrees clockwise
ROTATE_180 = 1, //!<Rotate 180 degrees clockwise
ROTATE_90_COUNTERCLOCKWISE = 2, //!<Rotate 270 degrees clockwise
};
/** @brief Rotates a 2D array in multiples of 90 degrees.
The function rotate rotates the array in one of three different ways:
* Rotate by 90 degrees clockwise (rotateCode = ROTATE_90).
The function cv::rotate rotates the array in one of three different ways:
* Rotate by 90 degrees clockwise (rotateCode = ROTATE_90_CLOCKWISE).
* Rotate by 180 degrees clockwise (rotateCode = ROTATE_180).
* Rotate by 270 degrees clockwise (rotateCode = ROTATE_270).
* Rotate by 270 degrees clockwise (rotateCode = ROTATE_90_COUNTERCLOCKWISE).
@param src input array.
@param dst output array of the same type as src. The size is the same with ROTATE_180,
and the rows and cols are switched for ROTATE_90 and ROTATE_270.
@param rotateCode an enum to specify how to rotate the array; see the enum RotateFlags
and the rows and cols are switched for ROTATE_90_CLOCKWISE and ROTATE_90_COUNTERCLOCKWISE.
@param rotateCode an enum to specify how to rotate the array; see the enum #RotateFlags
@sa transpose , repeat , completeSymm, flip, RotateFlags
*/
CV_EXPORTS_W void rotate(InputArray src, OutputArray dst, int rotateCode);
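A one-line usage sketch with the corrected flag names (src and dst are hypothetical cv::Mat objects):

cv::rotate(src, dst, cv::ROTATE_90_COUNTERCLOCKWISE); // i.e. rotate 270 degrees clockwise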
@@ -1356,6 +1348,17 @@ You may even get a negative value in the case of overflow.
*/
CV_EXPORTS_W void absdiff(InputArray src1, InputArray src2, OutputArray dst);
/** @brief This is an overloaded member function, provided for convenience (python)
Copies the matrix to another one.
When the operation mask is specified, if the Mat::create call shown above reallocates the matrix, the newly allocated matrix is initialized with all zeros before copying the data.
@param src source matrix.
@param dst Destination matrix. If it does not have a proper size or type before the operation, it is
reallocated.
@param mask Operation mask of the same size as \*this. Its non-zero elements indicate which matrix
elements need to be copied. The mask has to be of type CV_8U and can have 1 or multiple channels.
*/
void CV_EXPORTS_W copyTo(InputArray src, OutputArray dst, InputArray mask);
/** @brief Checks if array elements lie between the elements of two other arrays.
The function checks the range as follows:
@@ -1739,20 +1742,21 @@ element is a 2D/3D vector to be transformed.
*/
CV_EXPORTS_W void perspectiveTransform(InputArray src, OutputArray dst, InputArray m );
/** @brief Copies the lower or the upper half of a square matrix to another half.
/** @brief Copies the lower or the upper half of a square matrix to its another half.
The function cv::completeSymm copies the lower half of a square matrix to
The function cv::completeSymm copies the lower or the upper half of a square matrix to
its another half. The matrix diagonal remains unchanged:
* \f$\texttt{mtx}_{ij}=\texttt{mtx}_{ji}\f$ for \f$i > j\f$ if
- \f$\texttt{m}_{ij}=\texttt{m}_{ji}\f$ for \f$i > j\f$ if
lowerToUpper=false
* \f$\texttt{mtx}_{ij}=\texttt{mtx}_{ji}\f$ for \f$i < j\f$ if
- \f$\texttt{m}_{ij}=\texttt{m}_{ji}\f$ for \f$i < j\f$ if
lowerToUpper=true
@param mtx input-output floating-point square matrix.
@param m input-output floating-point square matrix.
@param lowerToUpper operation flag; if true, the lower half is copied to
the upper half. Otherwise, the upper half is copied to the lower half.
@sa flip, transpose
*/
CV_EXPORTS_W void completeSymm(InputOutputArray mtx, bool lowerToUpper = false);
CV_EXPORTS_W void completeSymm(InputOutputArray m, bool lowerToUpper = false);
/** @brief Initializes a scaled identity matrix.
@@ -1802,15 +1806,15 @@ The function cv::invert inverts the matrix src and stores the result in dst
the pseudo-inverse matrix (the dst matrix) so that norm(src\*dst - I) is
minimal, where I is an identity matrix.
In case of the DECOMP_LU method, the function returns non-zero value if
In case of the #DECOMP_LU method, the function returns non-zero value if
the inverse has been successfully calculated and 0 if src is singular.
In case of the DECOMP_SVD method, the function returns the inverse
In case of the #DECOMP_SVD method, the function returns the inverse
condition number of src (the ratio of the smallest singular value to the
largest singular value) and 0 if src is singular. The SVD method
calculates a pseudo-inverse matrix if src is singular.
Similarly to DECOMP_LU, the method DECOMP_CHOLESKY works only with
Similarly to #DECOMP_LU, the method #DECOMP_CHOLESKY works only with
non-singular square matrices that should also be symmetrical and
positively defined. In this case, the function stores the inverted
matrix in dst and returns non-zero. Otherwise, it returns 0.
@@ -1826,10 +1830,10 @@ CV_EXPORTS_W double invert(InputArray src, OutputArray dst, int flags = DECOMP_L
The function cv::solve solves a linear system or least-squares problem (the
latter is possible with SVD or QR methods, or by specifying the flag
DECOMP_NORMAL ):
#DECOMP_NORMAL ):
\f[\texttt{dst} = \arg \min _X \| \texttt{src1} \cdot \texttt{X} - \texttt{src2} \|\f]
If DECOMP_LU or DECOMP_CHOLESKY method is used, the function returns 1
If #DECOMP_LU or #DECOMP_CHOLESKY method is used, the function returns 1
if src1 (or \f$\texttt{src1}^T\texttt{src1}\f$ ) is non-singular. Otherwise,
it returns 0. In the latter case, dst is not valid. Other methods find a
pseudo-solution in case of a singular left-hand side part.
@@ -1841,7 +1845,7 @@ will not do the work. Use SVD::solveZ instead.
@param src1 input matrix on the left-hand side of the system.
@param src2 input matrix on the right-hand side of the system.
@param dst output solution.
@param flags solution (matrix inversion) method (cv::DecompTypes)
@param flags solution (matrix inversion) method (#DecompTypes)
@sa invert, SVD, eigen
*/
CV_EXPORTS_W bool solve(InputArray src1, InputArray src2,
@@ -1857,7 +1861,7 @@ proper comparison predicate.
@param src input single-channel array.
@param dst output array of the same size and type as src.
@param flags operation flags, a combination of cv::SortFlags
@param flags operation flags, a combination of #SortFlags
@sa sortIdx, randShuffle
*/
CV_EXPORTS_W void sort(InputArray src, OutputArray dst, int flags);
@@ -1893,6 +1897,7 @@ The function solveCubic finds the real roots of a cubic equation:
The roots are stored in the roots array.
@param coeffs equation coefficients, an array of 3 or 4 elements.
@param roots output array of real roots that has 1 or 3 elements.
@return number of real roots. It can be 0, 1 or 2.
*/
CV_EXPORTS_W int solveCubic(InputArray coeffs, OutputArray roots);
@@ -1953,7 +1958,7 @@ the set of input vectors.
@param nsamples number of samples
@param covar output covariance matrix of the type ctype and square size.
@param mean input or output (depending on the flags) array as the average value of the input vectors.
@param flags operation flags as a combination of cv::CovarFlags
@param flags operation flags as a combination of #CovarFlags
@param ctype type of the matrix; it equals 'CV_64F' by default.
@sa PCA, mulTransposed, Mahalanobis
@todo InputArrayOfArrays
@@ -1962,11 +1967,11 @@ CV_EXPORTS void calcCovarMatrix( const Mat* samples, int nsamples, Mat& covar, M
int flags, int ctype = CV_64F);
/** @overload
@note use cv::COVAR_ROWS or cv::COVAR_COLS flag
@note use #COVAR_ROWS or #COVAR_COLS flag
@param samples samples stored as rows/columns of a single matrix.
@param covar output covariance matrix of the type ctype and square size.
@param mean input or output (depending on the flags) array as the average value of the input vectors.
@param flags operation flags as a combination of cv::CovarFlags
@param flags operation flags as a combination of #CovarFlags
@param ctype type of the matrix; it equals 'CV_64F' by default.
*/
CV_EXPORTS_W void calcCovarMatrix( InputArray samples, OutputArray covar,
@@ -1976,10 +1981,20 @@ CV_EXPORTS_W void calcCovarMatrix( InputArray samples, OutputArray covar,
CV_EXPORTS_W void PCACompute(InputArray data, InputOutputArray mean,
OutputArray eigenvectors, int maxComponents = 0);
/** wrap PCA::operator() and add eigenvalues output parameter */
CV_EXPORTS_AS(PCACompute2) void PCACompute(InputArray data, InputOutputArray mean,
OutputArray eigenvectors, OutputArray eigenvalues,
int maxComponents = 0);
/** wrap PCA::operator() */
CV_EXPORTS_W void PCACompute(InputArray data, InputOutputArray mean,
OutputArray eigenvectors, double retainedVariance);
/** wrap PCA::operator() and add eigenvalues output parameter */
CV_EXPORTS_AS(PCACompute2) void PCACompute(InputArray data, InputOutputArray mean,
OutputArray eigenvectors, OutputArray eigenvalues,
double retainedVariance);
/** wrap PCA::project */
CV_EXPORTS_W void PCAProject(InputArray data, InputArray mean,
InputArray eigenvectors, OutputArray result);
@@ -1999,8 +2014,8 @@ CV_EXPORTS_W void SVBackSubst( InputArray w, InputArray u, InputArray vt,
The function cv::Mahalanobis calculates and returns the weighted distance between two vectors:
\f[d( \texttt{vec1} , \texttt{vec2} )= \sqrt{\sum_{i,j}{\texttt{icovar(i,j)}\cdot(\texttt{vec1}(I)-\texttt{vec2}(I))\cdot(\texttt{vec1(j)}-\texttt{vec2(j)})} }\f]
The covariance matrix may be calculated using the cv::calcCovarMatrix function and then inverted using
the invert function (preferably using the cv::DECOMP_SVD method, as the most accurate).
The covariance matrix may be calculated using the #calcCovarMatrix function and then inverted using
the invert function (preferably using the #DECOMP_SVD method, as the most accurate).
@param v1 first 1D input vector.
@param v2 second 1D input vector.
@param icovar inverse covariance matrix.
@@ -2030,28 +2045,28 @@ is how 2D *CCS* spectrum looks:
In case of 1D transform of a real vector, the output looks like the first row of the matrix above.
So, the function chooses an operation mode depending on the flags and size of the input array:
- If DFT_ROWS is set or the input array has a single row or single column, the function
performs a 1D forward or inverse transform of each row of a matrix when DFT_ROWS is set.
- If #DFT_ROWS is set or the input array has a single row or single column, the function
performs a 1D forward or inverse transform of each row of a matrix when #DFT_ROWS is set.
Otherwise, it performs a 2D transform.
- If the input array is real and DFT_INVERSE is not set, the function performs a forward 1D or
- If the input array is real and #DFT_INVERSE is not set, the function performs a forward 1D or
2D transform:
- When DFT_COMPLEX_OUTPUT is set, the output is a complex matrix of the same size as
- When #DFT_COMPLEX_OUTPUT is set, the output is a complex matrix of the same size as
input.
- When DFT_COMPLEX_OUTPUT is not set, the output is a real matrix of the same size as
- When #DFT_COMPLEX_OUTPUT is not set, the output is a real matrix of the same size as
input. In case of 2D transform, it uses the packed format as shown above. In case of a
single 1D transform, it looks like the first row of the matrix above. In case of
multiple 1D transforms (when using the DFT_ROWS flag), each row of the output matrix
multiple 1D transforms (when using the #DFT_ROWS flag), each row of the output matrix
looks like the first row of the matrix above.
- If the input array is complex and either DFT_INVERSE or DFT_REAL_OUTPUT are not set, the
- If the input array is complex and either #DFT_INVERSE or #DFT_REAL_OUTPUT are not set, the
output is a complex array of the same size as input. The function performs a forward or
inverse 1D or 2D transform of the whole input array or each row of the input array
independently, depending on the flags DFT_INVERSE and DFT_ROWS.
- When DFT_INVERSE is set and the input array is real, or it is complex but DFT_REAL_OUTPUT
- When #DFT_INVERSE is set and the input array is real, or it is complex but #DFT_REAL_OUTPUT
is set, the output is a real array of the same size as input. The function performs a 1D or 2D
inverse transformation of the whole input array or each individual row, depending on the flags
DFT_INVERSE and DFT_ROWS.
#DFT_INVERSE and #DFT_ROWS.
If DFT_SCALE is set, the scaling is done after the transformation.
If #DFT_SCALE is set, the scaling is done after the transformation.
Unlike dct , the function supports arrays of arbitrary size. But only those arrays are processed
efficiently, whose sizes can be factorized in a product of small prime numbers (2, 3, and 5 in the
@@ -2117,7 +2132,7 @@ To optimize this sample, consider the following approaches:
- If different tiles in C can be calculated in parallel and, thus, the convolution is done by
parts, the loop can be threaded.
All of the above improvements have been implemented in matchTemplate and filter2D . Therefore, by
All of the above improvements have been implemented in #matchTemplate and #filter2D . Therefore, by
using them, you can get even better performance than with the above theoretically optimal
implementation. Though, those two functions actually calculate cross-correlation, not convolution,
so you need to "flip" the second convolution operand B vertically and horizontally using flip .
@@ -2130,10 +2145,10 @@ so you need to "flip" the second convolution operand B vertically and horizontal
opencv_source/samples/python/dft.py
@param src input array that could be real or complex.
@param dst output array whose size and type depends on the flags .
@param flags transformation flags, representing a combination of the cv::DftFlags
@param flags transformation flags, representing a combination of the #DftFlags
@param nonzeroRows when the parameter is not zero, the function assumes that only the first
nonzeroRows rows of the input array (DFT_INVERSE is not set) or only the first nonzeroRows of the
output array (DFT_INVERSE is set) contain non-zeros, thus, the function can handle the rest of the
nonzeroRows rows of the input array (#DFT_INVERSE is not set) or only the first nonzeroRows of the
output array (#DFT_INVERSE is set) contain non-zeros, thus, the function can handle the rest of the
rows more efficiently and save some time; this technique is very useful for calculating array
cross-correlation or convolution using DFT.
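As a minimal sketch of a forward/inverse round trip (the input is random single-channel
float data, for illustration only):
@code
    cv::Mat src(64, 64, CV_32F), spectrum, restored;
    cv::randu(src, 0.0, 1.0);
    cv::dft(src, spectrum, cv::DFT_COMPLEX_OUTPUT);  // full complex spectrum
    cv::dft(spectrum, restored,
            cv::DFT_INVERSE | cv::DFT_REAL_OUTPUT | cv::DFT_SCALE);  // restored ~= src
@endcode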
@sa dct , getOptimalDFTSize , mulSpectrums, filter2D , matchTemplate , flip , cartToPolar ,
@@ -2143,13 +2158,13 @@ CV_EXPORTS_W void dft(InputArray src, OutputArray dst, int flags = 0, int nonzer
/** @brief Calculates the inverse Discrete Fourier Transform of a 1D or 2D array.
idft(src, dst, flags) is equivalent to dft(src, dst, flags | DFT_INVERSE) .
@note None of dft and idft scales the result by default. So, you should pass DFT_SCALE to one of
idft(src, dst, flags) is equivalent to dft(src, dst, flags | #DFT_INVERSE) .
@note None of dft and idft scales the result by default. So, you should pass #DFT_SCALE to one of
dft or idft explicitly to make these transforms mutually inverse.
@sa dft, dct, idct, mulSpectrums, getOptimalDFTSize
@param src input floating-point real or complex array.
@param dst output array whose size and type depend on the flags.
@param flags operation flags (see dft and cv::DftFlags).
@param flags operation flags (see dft and #DftFlags).
@param nonzeroRows number of dst rows to process; the rest of the rows have undefined content (see
the convolution sample in dft description).
*/
@@ -2174,9 +2189,9 @@ floating-point array:
\f[X = \left (C^{(N)} \right )^T \cdot X \cdot C^{(N)}\f]
The function chooses the mode of operation by looking at the flags and size of the input array:
- If (flags & DCT_INVERSE) == 0 , the function does a forward 1D or 2D transform. Otherwise, it
- If (flags & #DCT_INVERSE) == 0 , the function does a forward 1D or 2D transform. Otherwise, it
is an inverse 1D or 2D transform.
- If (flags & DCT_ROWS) != 0 , the function performs a 1D transform of each row.
- If (flags & #DCT_ROWS) != 0 , the function performs a 1D transform of each row.
- If the array is a single column or a single row, the function performs a 1D transform.
- If none of the above is true, the function performs a 2D transform.
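For example, a short sketch of a forward and inverse 2D transform (random even-sized
input, for illustration):
@code
    cv::Mat src(8, 8, CV_32F), freq, restored;
    cv::randu(src, 0.0, 1.0);
    cv::dct(src, freq);                        // forward 2D DCT
    cv::dct(freq, restored, cv::DCT_INVERSE);  // restored ~= src
@endcode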
@@ -2515,14 +2530,18 @@ public:
Mat mean; //!< mean value subtracted before the projection and added after the back projection
};
/** @example pca.cpp
An example using %PCA for dimensionality reduction while maintaining an amount of variance
*/
/** @example samples/cpp/pca.cpp
An example using %PCA for dimensionality reduction while maintaining an amount of variance
*/
/** @example samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp
Check @ref tutorial_introduction_to_pca "the corresponding tutorial" for more details
*/
/**
@brief Linear Discriminant Analysis
@todo document this class
*/
@brief Linear Discriminant Analysis
@todo document this class
*/
class CV_EXPORTS LDA
{
public:
@@ -2584,7 +2603,6 @@ public:
static Mat subspaceReconstruct(InputArray W, InputArray mean, InputArray src);
protected:
bool _dataAsRow; // unused, but needed for 3.0 ABI compatibility.
int _num_components;
Mat _eigenvectors;
Mat _eigenvalues;
@@ -2629,7 +2647,7 @@ public:
/** @overload
initializes an empty SVD structure and then calls SVD::operator()
@param src decomposed matrix.
@param src decomposed matrix. The depth has to be CV_32F or CV_64F.
@param flags operation flags (SVD::Flags)
*/
SVD( InputArray src, int flags = 0 );
@@ -2642,7 +2660,7 @@ public:
different matrices. Each time, if needed, the previous u, `vt`, and w
are reclaimed and the new matrices are created, which is all handled by
Mat::create.
@param src decomposed matrix.
@param src decomposed matrix. The depth has to be CV_32F or CV_64F.
@param flags operation flags (SVD::Flags)
*/
SVD& operator ()( InputArray src, int flags = 0 );
@@ -2658,18 +2676,18 @@ public:
SVD::compute(A, w, u, vt);
@endcode
@param src decomposed matrix
@param src decomposed matrix. The depth has to be CV_32F or CV_64F.
@param w calculated singular values
@param u calculated left singular vectors
@param vt transposed matrix of right singular values
@param flags operation flags - see SVD::SVD.
@param vt transposed matrix of right singular vectors
@param flags operation flags - see SVD::Flags.
*/
static void compute( InputArray src, OutputArray w,
OutputArray u, OutputArray vt, int flags = 0 );
/** @overload
computes singular values of a matrix
@param src decomposed matrix
@param src decomposed matrix. The depth has to be CV_32F or CV_64F.
@param w calculated singular values
@param flags operation flags - see SVD::Flags.
*/
@@ -2713,7 +2731,7 @@ public:
if you need to solve many linear systems with the same left-hand side
(for example, src ). If all you need is to solve a single system
(possibly with multiple rhs immediately available), simply call solve
and pass DECOMP_SVD there. It does absolutely the same thing.
and pass #DECOMP_SVD there. It does absolutely the same thing.
*/
void backSubst( InputArray rhs, OutputArray dst ) const;
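/** A sketch of the decompose-once, solve-repeatedly pattern described above (the
overdetermined system here is random, for illustration):
@code
    cv::Mat A(6, 3, CV_64F), b(6, 1, CV_64F), x;
    cv::randu(A, -1.0, 1.0);
    cv::randu(b, -1.0, 1.0);
    cv::SVD svd(A);        // decompose once
    svd.backSubst(b, x);   // least-squares solution of A*x = b; reuse svd for other b's
@endcode
*/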
@@ -2838,7 +2856,7 @@ public:
use explicit type cast operators, as in the a1 initialization above.
@param a lower inclusive boundary of the returned random number.
@param b upper non-inclusive boundary of the returned random number.
*/
*/
int uniform(int a, int b);
/** @overload */
float uniform(float a, float b);
@@ -2900,7 +2918,7 @@ public:
Inspired by http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/MT2002/CODES/mt19937ar.c
@todo document
*/
*/
class CV_EXPORTS RNG_MT19937
{
public:
@@ -2918,17 +2936,11 @@ public:
unsigned operator ()(unsigned N);
unsigned operator ()();
/** @brief returns uniformly distributed integer random number from [a,b) range
*/
/** @brief returns uniformly distributed integer random number from [a,b) range*/
int uniform(int a, int b);
/** @brief returns uniformly distributed floating-point random number from [a,b) range
*/
/** @brief returns uniformly distributed floating-point random number from [a,b) range*/
float uniform(float a, float b);
/** @brief returns uniformly distributed double-precision floating-point random number from [a,b) range
*/
/** @brief returns uniformly distributed double-precision floating-point random number from [a,b) range*/
double uniform(double a, double b);
private:
@@ -2942,14 +2954,14 @@ private:
//! @addtogroup core_cluster
//! @{
/** @example kmeans.cpp
An example on K-means clustering
/** @example samples/cpp/kmeans.cpp
An example on K-means clustering
*/
/** @brief Finds centers of clusters and groups input samples around the clusters.
The function kmeans implements a k-means algorithm that finds the centers of cluster_count clusters
and groups the input samples around the clusters. As an output, \f$\texttt{labels}_i\f$ contains a
and groups the input samples around the clusters. As an output, \f$\texttt{bestLabels}_i\f$ contains a
0-based cluster index for the sample stored in the \f$i^{th}\f$ row of the samples matrix.
@note
@@ -2976,7 +2988,7 @@ function parameter).
after every attempt. The best (minimum) value is chosen and the corresponding labels and the
compactness value are returned by the function. Basically, you can use only the core of the
function, set the number of attempts to 1, initialize labels each time using a custom algorithm,
pass them with the ( flags = KMEANS_USE_INITIAL_LABELS ) flag, and then choose the best
pass them with the ( flags = #KMEANS_USE_INITIAL_LABELS ) flag, and then choose the best
(most-compact) clustering.
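For example, a basic invocation (the point data is random; K, the criteria values and
the number of attempts are illustrative):
@code
    cv::Mat points(100, 2, CV_32F), labels, centers;
    cv::randu(points, 0.0, 100.0);
    double compactness = cv::kmeans(points, 3, labels,
            cv::TermCriteria(cv::TermCriteria::EPS + cv::TermCriteria::COUNT, 10, 1.0),
            3, cv::KMEANS_PP_CENTERS, centers);
@endcode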
*/
CV_EXPORTS_W double kmeans( InputArray data, int K, InputOutputArray bestLabels,
@@ -3003,7 +3015,8 @@ public:
class CV_EXPORTS Formatter
{
public:
enum { FMT_DEFAULT = 0,
enum FormatType {
FMT_DEFAULT = 0,
FMT_MATLAB = 1,
FMT_CSV = 2,
FMT_PYTHON = 3,
@@ -3015,11 +3028,12 @@ public:
virtual Ptr<Formatted> format(const Mat& mtx) const = 0;
virtual void set16fPrecision(int p = 4) = 0;
virtual void set32fPrecision(int p = 8) = 0;
virtual void set64fPrecision(int p = 16) = 0;
virtual void setMultiline(bool ml = true) = 0;
static Ptr<Formatter> get(int fmt = FMT_DEFAULT);
static Ptr<Formatter> get(Formatter::FormatType fmt = FMT_DEFAULT);
};
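/** With the scoped FormatType above, a matrix can be printed in a chosen style; a
minimal sketch:
@code
    cv::Mat m = cv::Mat::eye(3, 3, CV_32F);
    std::cout << cv::format(m, cv::Formatter::FMT_PYTHON) << std::endl;
@endcode
*/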
@@ -3042,7 +3056,7 @@ String& operator << (String& out, const Mat& mtx)
class CV_EXPORTS Algorithm;
template<typename _Tp> struct ParamType {};
template<typename _Tp, typename _EnumTp = void> struct ParamType {};
/** @brief This is a base class for all more or less complex algorithms in OpenCV
@@ -3053,32 +3067,9 @@ matching, graph-cut etc.), background subtraction (which can be done using mixtu
models, codebook-based algorithm etc.), optical flow (block matching, Lucas-Kanade, Horn-Schunck
etc.).
Here is example of SIFT use in your application via Algorithm interface:
@code
#include "opencv2/opencv.hpp"
#include "opencv2/xfeatures2d.hpp"
using namespace cv::xfeatures2d;
Ptr<Feature2D> sift = SIFT::create();
FileStorage fs("sift_params.xml", FileStorage::READ);
if( fs.isOpened() ) // if we have file with parameters, read them
{
sift->read(fs["sift_params"]);
fs.release();
}
else // else modify the parameters and store them; user can later edit the file to use different parameters
{
sift->setContrastThreshold(0.01f); // lower the contrast threshold, compared to the default value
{
WriteStructContext ws(fs, "sift_params", CV_NODE_MAP);
sift->write(fs);
}
}
Mat image = imread("myimage.png", 0), descriptors;
vector<KeyPoint> keypoints;
sift->detectAndCompute(image, noArray(), keypoints, descriptors);
@endcode
*/
Here is an example of SimpleBlobDetector use in your application via the Algorithm interface:
@snippet snippets/core_various.cpp Algorithm
*/
class CV_EXPORTS_W Algorithm
{
public:
@@ -3091,32 +3082,32 @@ public:
/** @brief Stores algorithm parameters in a file storage
*/
virtual void write(FileStorage& fs) const { (void)fs; }
virtual void write(FileStorage& fs) const { CV_UNUSED(fs); }
/** @brief simplified API for language bindings
* @overload
*/
* @overload
*/
CV_WRAP void write(const Ptr<FileStorage>& fs, const String& name = String()) const;
/** @brief Reads algorithm parameters from a file storage
*/
CV_WRAP virtual void read(const FileNode& fn) { (void)fn; }
CV_WRAP virtual void read(const FileNode& fn) { CV_UNUSED(fn); }
/** @brief Returns true if the Algorithm is empty (e.g. in the very beginning or after unsuccessful read)
*/
*/
CV_WRAP virtual bool empty() const { return false; }
/** @brief Reads algorithm from the file node
This is a static template method of Algorithm. Its usage is as follows (in the case of SVM):
@code
cv::FileStorage fsRead("example.xml", FileStorage::READ);
Ptr<SVM> svm = Algorithm::read<SVM>(fsRead.root());
@endcode
In order to make this method work, the derived class must overwrite Algorithm::read(const
FileNode& fn) and also have static create() method without parameters
(or with all the optional parameters)
*/
This is a static template method of Algorithm. Its usage is as follows (in the case of SVM):
@code
cv::FileStorage fsRead("example.xml", FileStorage::READ);
Ptr<SVM> svm = Algorithm::read<SVM>(fsRead.root());
@endcode
In order to make this method work, the derived class must overwrite Algorithm::read(const
FileNode& fn) and also have static create() method without parameters
(or with all the optional parameters)
*/
template<typename _Tp> static Ptr<_Tp> read(const FileNode& fn)
{
Ptr<_Tp> obj = _Tp::create();
@@ -3126,16 +3117,16 @@ public:
/** @brief Loads algorithm from the file
@param filename Name of the file to read.
@param objname The optional name of the node to read (if empty, the first top-level node will be used)
@param filename Name of the file to read.
@param objname The optional name of the node to read (if empty, the first top-level node will be used)
This is a static template method of Algorithm. Its usage is as follows (in the case of SVM):
@code
Ptr<SVM> svm = Algorithm::load<SVM>("my_svm_model.xml");
@endcode
In order to make this method work, the derived class must overwrite Algorithm::read(const
FileNode& fn).
*/
This is a static template method of Algorithm. Its usage is as follows (in the case of SVM):
@code
Ptr<SVM> svm = Algorithm::load<SVM>("my_svm_model.xml");
@endcode
In order to make this method work, the derived class must overwrite Algorithm::read(const
FileNode& fn).
*/
template<typename _Tp> static Ptr<_Tp> load(const String& filename, const String& objname=String())
{
FileStorage fs(filename, FileStorage::READ);
@@ -3149,14 +3140,14 @@ public:
/** @brief Loads algorithm from a String
@param strModel The string variable containing the model you want to load.
@param objname The optional name of the node to read (if empty, the first top-level node will be used)
@param strModel The string variable containing the model you want to load.
@param objname The optional name of the node to read (if empty, the first top-level node will be used)
This is a static template method of Algorithm. Its usage is as follows (in the case of SVM):
@code
Ptr<SVM> svm = Algorithm::loadFromString<SVM>(myStringModel);
@endcode
*/
This is a static template method of Algorithm. Its usage is as follows (in the case of SVM):
@code
Ptr<SVM> svm = Algorithm::loadFromString<SVM>(myStringModel);
@endcode
*/
template<typename _Tp> static Ptr<_Tp> loadFromString(const String& strModel, const String& objname=String())
{
FileStorage fs(strModel, FileStorage::READ + FileStorage::MEMORY);
@@ -3167,20 +3158,20 @@ public:
}
/** Saves the algorithm to a file.
In order to make this method work, the derived class must implement Algorithm::write(FileStorage& fs). */
In order to make this method work, the derived class must implement Algorithm::write(FileStorage& fs). */
CV_WRAP virtual void save(const String& filename) const;
/** Returns the algorithm string identifier.
This string is used as top level xml/yml node tag when the object is saved to a file or string. */
This string is used as top level xml/yml node tag when the object is saved to a file or string. */
CV_WRAP virtual String getDefaultName() const;
protected:
void writeFormat(FileStorage& fs) const;
};
struct Param {
enum { INT=0, BOOLEAN=1, REAL=2, STRING=3, MAT=4, MAT_VECTOR=5, ALGORITHM=6, FLOAT=7,
UNSIGNED_INT=8, UINT64=9, UCHAR=11 };
enum struct Param {
INT=0, BOOLEAN=1, REAL=2, STRING=3, MAT=4, MAT_VECTOR=5, ALGORITHM=6, FLOAT=7,
UNSIGNED_INT=8, UINT64=9, UCHAR=11, SCALAR=12
};
@@ -3190,7 +3181,7 @@ template<> struct ParamType<bool>
typedef bool const_param_type;
typedef bool member_type;
enum { type = Param::BOOLEAN };
static const Param type = Param::BOOLEAN;
};
template<> struct ParamType<int>
@@ -3198,7 +3189,7 @@ template<> struct ParamType<int>
typedef int const_param_type;
typedef int member_type;
enum { type = Param::INT };
static const Param type = Param::INT;
};
template<> struct ParamType<double>
@@ -3206,7 +3197,7 @@ template<> struct ParamType<double>
typedef double const_param_type;
typedef double member_type;
enum { type = Param::REAL };
static const Param type = Param::REAL;
};
template<> struct ParamType<String>
@@ -3214,7 +3205,7 @@ template<> struct ParamType<String>
typedef const String& const_param_type;
typedef String member_type;
enum { type = Param::STRING };
static const Param type = Param::STRING;
};
template<> struct ParamType<Mat>
@@ -3222,7 +3213,7 @@ template<> struct ParamType<Mat>
typedef const Mat& const_param_type;
typedef Mat member_type;
enum { type = Param::MAT };
static const Param type = Param::MAT;
};
template<> struct ParamType<std::vector<Mat> >
@@ -3230,7 +3221,7 @@ template<> struct ParamType<std::vector<Mat> >
typedef const std::vector<Mat>& const_param_type;
typedef std::vector<Mat> member_type;
enum { type = Param::MAT_VECTOR };
static const Param type = Param::MAT_VECTOR;
};
template<> struct ParamType<Algorithm>
@@ -3238,7 +3229,7 @@ template<> struct ParamType<Algorithm>
typedef const Ptr<Algorithm>& const_param_type;
typedef Ptr<Algorithm> member_type;
enum { type = Param::ALGORITHM };
static const Param type = Param::ALGORITHM;
};
template<> struct ParamType<float>
@@ -3246,7 +3237,7 @@ template<> struct ParamType<float>
typedef float const_param_type;
typedef float member_type;
enum { type = Param::FLOAT };
static const Param type = Param::FLOAT;
};
template<> struct ParamType<unsigned>
@@ -3254,7 +3245,7 @@ template<> struct ParamType<unsigned>
typedef unsigned const_param_type;
typedef unsigned member_type;
enum { type = Param::UNSIGNED_INT };
static const Param type = Param::UNSIGNED_INT;
};
template<> struct ParamType<uint64>
@@ -3262,7 +3253,7 @@ template<> struct ParamType<uint64>
typedef uint64 const_param_type;
typedef uint64 member_type;
enum { type = Param::UINT64 };
static const Param type = Param::UINT64;
};
template<> struct ParamType<uchar>
@@ -3270,7 +3261,24 @@ template<> struct ParamType<uchar>
typedef uchar const_param_type;
typedef uchar member_type;
enum { type = Param::UCHAR };
static const Param type = Param::UCHAR;
};
template<> struct ParamType<Scalar>
{
typedef const Scalar& const_param_type;
typedef Scalar member_type;
static const Param type = Param::SCALAR;
};
template<typename _Tp>
struct ParamType<_Tp, typename std::enable_if< std::is_enum<_Tp>::value >::type>
{
typedef typename std::underlying_type<_Tp>::type const_param_type;
typedef typename std::underlying_type<_Tp>::type member_type;
static const Param type = Param::INT;
};
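/** A sketch of what the trait change means for client code: the type tag is now a
typed constant of the scoped enum rather than an anonymous enum, so it can be compared
directly, e.g.:
@code
    static_assert(cv::ParamType<int>::type == cv::Param::INT,
                  "ParamType maps int to Param::INT");
@endcode
*/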
//! @} core_basic

View File

@@ -55,7 +55,72 @@ namespace cv
//! @{
/** @brief Affine transform
@todo document
*
* It represents a 4x4 homogeneous transformation matrix \f$T\f$
*
* \f[T =
* \begin{bmatrix}
* R & t\\
* 0 & 1\\
* \end{bmatrix}
* \f]
*
* where \f$R\f$ is a 3x3 rotation matrix and \f$t\f$ is a 3x1 translation vector.
*
* You can specify \f$R\f$ either by a 3x3 rotation matrix or by a 3x1 rotation vector,
* which is converted to a 3x3 rotation matrix by the Rodrigues formula.
*
* To construct a matrix \f$T\f$ representing first rotation around the axis \f$r\f$ with rotation
* angle \f$|r|\f$ in radian (right hand rule) and then translation by the vector \f$t\f$, you can use
*
* @code
* cv::Vec3f r, t;
* cv::Affine3f T(r, t);
* @endcode
*
* If you already have the rotation matrix \f$R\f$, then you can use
*
* @code
* cv::Matx33f R;
* cv::Affine3f T(R, t);
* @endcode
*
* To extract the rotation matrix \f$R\f$ from \f$T\f$, use
*
* @code
* cv::Matx33f R = T.rotation();
* @endcode
*
* To extract the translation vector \f$t\f$ from \f$T\f$, use
*
* @code
* cv::Vec3f t = T.translation();
* @endcode
*
* To extract the rotation vector \f$r\f$ from \f$T\f$, use
*
* @code
* cv::Vec3f r = T.rvec();
* @endcode
*
* Note that since the mapping from rotation vectors to rotation matrices
* is many to one, the returned rotation vector is not necessarily the one
* you used before to set the matrix.
*
* If you have two transformations \f$T = T_1 * T_2\f$, use
*
* @code
* cv::Affine3f T, T1, T2;
* T = T2.concatenate(T1);
* @endcode
*
* To get the inverse transform of \f$T\f$, use
*
* @code
* cv::Affine3f T, T_inv;
* T_inv = T.inv();
* @endcode
*
*/
template<typename T>
class Affine3
@@ -66,45 +131,127 @@ namespace cv
typedef Matx<float_type, 4, 4> Mat4;
typedef Vec<float_type, 3> Vec3;
//! Default constructor. It represents a 4x4 identity matrix.
Affine3();
//! Augmented affine matrix
Affine3(const Mat4& affine);
//! Rotation matrix
/**
* The resulting 4x4 matrix is
*
* \f[
* \begin{bmatrix}
* R & t\\
* 0 & 1\\
* \end{bmatrix}
* \f]
*
* @param R 3x3 rotation matrix.
* @param t 3x1 translation vector.
*/
Affine3(const Mat3& R, const Vec3& t = Vec3::all(0));
//! Rodrigues vector
/**
* Rodrigues vector.
*
* The last row of the current matrix is set to [0,0,0,1].
*
* @param rvec 3x1 rotation vector. Its direction indicates the rotation axis and its length
* indicates the rotation angle in radian (using right hand rule).
* @param t 3x1 translation vector.
*/
Affine3(const Vec3& rvec, const Vec3& t = Vec3::all(0));
//! Combines all contructors above. Supports 4x4, 4x3, 3x3, 1x3, 3x1 sizes of data matrix
/**
* Combines all constructors above. Supports 4x4, 3x4, 3x3, 1x3, 3x1 sizes of data matrix.
*
* The last row of the current matrix is set to [0,0,0,1] when data is not 4x4.
*
* @param data 1-channel matrix.
* When it is 4x4, it is copied to the current matrix and t is not used.
* When it is 3x4, it is copied to the upper part 3x4 of the current matrix and t is not used.
* When it is 3x3, it is copied to the upper left 3x3 part of the current matrix.
* When it is 3x1 or 1x3, it is treated as a rotation vector and the Rodrigues formula is used
* to compute a 3x3 rotation matrix.
* @param t 3x1 translation vector. It is used only when data is neither 4x4 nor 3x4.
*/
explicit Affine3(const Mat& data, const Vec3& t = Vec3::all(0));
//! From 16th element array
//! From 16-element array
explicit Affine3(const float_type* vals);
//! Create identity transform
//! Create a 4x4 identity transform
static Affine3 Identity();
//! Rotation matrix
/**
* Rotation matrix.
*
* Copy the rotation matrix to the upper left 3x3 part of the current matrix.
* The remaining elements of the current matrix are not changed.
*
* @param R 3x3 rotation matrix.
*
*/
void rotation(const Mat3& R);
//! Rodrigues vector
/**
* Rodrigues vector.
*
* It sets the upper left 3x3 part of the matrix. The remaining part is unaffected.
*
* @param rvec 3x1 rotation vector. The direction indicates the rotation axis and
* its length indicates the rotation angle in radian (using the right thumb convention).
*/
void rotation(const Vec3& rvec);
//! Combines rotation methods above. Suports 3x3, 1x3, 3x1 sizes of data matrix;
/**
* Combines rotation methods above. Supports 3x3, 1x3, 3x1 sizes of data matrix.
*
* It sets the upper left 3x3 part of the matrix. The remaining part is unaffected.
*
* @param data 1-channel matrix.
* When it is a 3x3 matrix, it sets the upper left 3x3 part of the current matrix.
* When it is a 1x3 or 3x1 matrix, it is used as a rotation vector. The Rodrigues formula
* is used to compute the rotation matrix and sets the upper left 3x3 part of the current matrix.
*/
void rotation(const Mat& data);
/**
* Copy the 3x3 matrix L to the upper left part of the current matrix
*
* It sets the upper left 3x3 part of the matrix. The remaining part is unaffected.
*
* @param L 3x3 matrix.
*/
void linear(const Mat3& L);
/**
* Copy t to the first three elements of the last column of the current matrix
*
* It sets the upper right 3x1 part of the matrix. The remaining part is unaffected.
*
* @param t 3x1 translation vector.
*/
void translation(const Vec3& t);
//! @return the upper left 3x3 part
Mat3 rotation() const;
//! @return the upper left 3x3 part
Mat3 linear() const;
//! @return the upper right 3x1 part
Vec3 translation() const;
//! Rodrigues vector
//! Rodrigues vector.
//! @return a vector representing the upper left 3x3 rotation matrix of the current matrix.
//! @warning Since the mapping between rotation vectors and rotation matrices is many to one,
//! this function returns only one rotation vector that represents the current rotation matrix,
//! which is not necessarily the same one set by `rotation(const Vec3& rvec)`.
Vec3 rvec() const;
//! @return the inverse of the current matrix.
Affine3 inv(int method = cv::DECOMP_SVD) const;
//! a.rotate(R) is equivalent to Affine(R, 0) * a;
@@ -113,7 +260,7 @@ namespace cv
//! a.rotate(rvec) is equivalent to Affine(rvec, 0) * a;
Affine3 rotate(const Vec3& rvec) const;
//! a.translate(t) is equivalent to Affine(E, t) * a;
//! a.translate(t) is equivalent to Affine(E, t) * a, where E is an identity matrix
Affine3 translate(const Vec3& t) const;
//! a.concatenate(affine) is equivalent to affine * a;
@@ -136,6 +283,7 @@ namespace cv
template<typename T> static
Affine3<T> operator*(const Affine3<T>& affine1, const Affine3<T>& affine2);
//! V is a 3-element vector with member fields x, y and z
template<typename T, typename V> static
V operator*(const Affine3<T>& affine, const V& vector);
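/** For example (a sketch; the angle and vectors are illustrative), applying a rotation
of \f$\pi/2\f$ around the z axis followed by a translation to a point:
@code
    cv::Affine3f T(cv::Vec3f(0.f, 0.f, (float)CV_PI/2), cv::Vec3f(1.f, 0.f, 0.f));
    cv::Vec3f p(1.f, 0.f, 0.f);
    cv::Vec3f q = T * p;  // approximately (1, 1, 0)
@endcode
*/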
@@ -178,7 +326,7 @@ namespace cv
//! @cond IGNORED
///////////////////////////////////////////////////////////////////////////////////
// Implementaiton
// Implementation
template<typename T> inline
cv::Affine3<T>::Affine3()
@@ -212,6 +360,7 @@ template<typename T> inline
cv::Affine3<T>::Affine3(const cv::Mat& data, const Vec3& t)
{
CV_Assert(data.type() == cv::traits::Type<T>::value);
CV_Assert(data.channels() == 1);
if (data.cols == 4 && data.rows == 4)
{
@@ -276,11 +425,12 @@ void cv::Affine3<T>::rotation(const Vec3& _rvec)
}
}
//Combines rotation methods above. Suports 3x3, 1x3, 3x1 sizes of data matrix;
//Combines rotation methods above. Supports 3x3, 1x3, 3x1 sizes of data matrix;
template<typename T> inline
void cv::Affine3<T>::rotation(const cv::Mat& data)
{
CV_Assert(data.type() == cv::traits::Type<T>::value);
CV_Assert(data.channels() == 1);
if (data.cols == 3 && data.rows == 3)
{
@@ -295,7 +445,7 @@ void cv::Affine3<T>::rotation(const cv::Mat& data)
rotation(_rvec);
}
else
CV_Assert(!"Input marix can be 3x3, 1x3 or 3x1");
CV_Error(Error::StsError, "Input matrix can only be 3x3, 1x3 or 3x1");
}
template<typename T> inline

View File

@@ -271,7 +271,7 @@ enum BorderTypes {
BORDER_REFLECT = 2, //!< `fedcba|abcdefgh|hgfedcb`
BORDER_WRAP = 3, //!< `cdefgh|abcdefgh|abcdefg`
BORDER_REFLECT_101 = 4, //!< `gfedcb|abcdefgh|gfedcba`
BORDER_TRANSPARENT = 5, //!< `uvwxyz|absdefgh|ijklmno`
BORDER_TRANSPARENT = 5, //!< `uvwxyz|abcdefgh|ijklmno`
BORDER_REFLECT101 = BORDER_REFLECT_101, //!< same as BORDER_REFLECT_101
BORDER_DEFAULT = BORDER_REFLECT_101, //!< same as BORDER_REFLECT_101
@@ -283,84 +283,6 @@ enum BorderTypes {
//! @addtogroup core_utils
//! @{
//! @cond IGNORED
//////////////// static assert /////////////////
#define CVAUX_CONCAT_EXP(a, b) a##b
#define CVAUX_CONCAT(a, b) CVAUX_CONCAT_EXP(a,b)
#if defined(__clang__)
# ifndef __has_extension
# define __has_extension __has_feature /* compatibility, for older versions of clang */
# endif
# if __has_extension(cxx_static_assert)
# define CV_StaticAssert(condition, reason) static_assert((condition), reason " " #condition)
# elif __has_extension(c_static_assert)
# define CV_StaticAssert(condition, reason) _Static_assert((condition), reason " " #condition)
# endif
#elif defined(__GNUC__)
# if (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L)
# define CV_StaticAssert(condition, reason) static_assert((condition), reason " " #condition)
# endif
#elif defined(_MSC_VER)
# if _MSC_VER >= 1600 /* MSVC 10 */
# define CV_StaticAssert(condition, reason) static_assert((condition), reason " " #condition)
# endif
#endif
#ifndef CV_StaticAssert
# if !defined(__clang__) && defined(__GNUC__) && (__GNUC__*100 + __GNUC_MINOR__ > 302)
# define CV_StaticAssert(condition, reason) ({ extern int __attribute__((error("CV_StaticAssert: " reason " " #condition))) CV_StaticAssert(); ((condition) ? 0 : CV_StaticAssert()); })
# else
template <bool x> struct CV_StaticAssert_failed;
template <> struct CV_StaticAssert_failed<true> { enum { val = 1 }; };
template<int x> struct CV_StaticAssert_test {};
# define CV_StaticAssert(condition, reason)\
typedef cv::CV_StaticAssert_test< sizeof(cv::CV_StaticAssert_failed< static_cast<bool>(condition) >) > CVAUX_CONCAT(CV_StaticAssert_failed_at_, __LINE__)
# endif
#endif
// Suppress warning "-Wdeprecated-declarations" / C4996
#if defined(_MSC_VER)
#define CV_DO_PRAGMA(x) __pragma(x)
#elif defined(__GNUC__)
#define CV_DO_PRAGMA(x) _Pragma (#x)
#else
#define CV_DO_PRAGMA(x)
#endif
#ifdef _MSC_VER
#define CV_SUPPRESS_DEPRECATED_START \
CV_DO_PRAGMA(warning(push)) \
CV_DO_PRAGMA(warning(disable: 4996))
#define CV_SUPPRESS_DEPRECATED_END CV_DO_PRAGMA(warning(pop))
#elif defined (__clang__) || ((__GNUC__) && (__GNUC__*100 + __GNUC_MINOR__ > 405))
#define CV_SUPPRESS_DEPRECATED_START \
CV_DO_PRAGMA(GCC diagnostic push) \
CV_DO_PRAGMA(GCC diagnostic ignored "-Wdeprecated-declarations")
#define CV_SUPPRESS_DEPRECATED_END CV_DO_PRAGMA(GCC diagnostic pop)
#else
#define CV_SUPPRESS_DEPRECATED_START
#define CV_SUPPRESS_DEPRECATED_END
#endif
#define CV_UNUSED(name) (void)name
#if defined __GNUC__ && !defined __EXCEPTIONS
#define CV_TRY
#define CV_CATCH(A, B) for (A B; false; )
#define CV_CATCH_ALL if (false)
#define CV_THROW(A) abort()
#define CV_RETHROW() abort()
#else
#define CV_TRY try
#define CV_CATCH(A, B) catch(const A & B)
#define CV_CATCH_ALL catch(...)
#define CV_THROW(A) throw A
#define CV_RETHROW() throw
#endif
//! @endcond
/*! @brief Signals an error and raises the exception.
By default the function prints information about the error to stderr,
@@ -371,51 +293,17 @@ It is possible to alternate error processing by using redirectError().
@param _func - function name. Available only when the compiler supports getting it
@param _file - source file name where the error has occurred
@param _line - line number in the source file where the error has occurred
@see CV_Error, CV_Error_, CV_ErrorNoReturn, CV_ErrorNoReturn_, CV_Assert, CV_DbgAssert
@see CV_Error, CV_Error_, CV_Assert, CV_DbgAssert
*/
CV_EXPORTS void error(int _code, const String& _err, const char* _func, const char* _file, int _line);
#ifdef __GNUC__
# if defined __clang__ || defined __APPLE__
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Winvalid-noreturn"
# endif
#endif
/** same as cv::error, but does not return */
CV_INLINE CV_NORETURN void errorNoReturn(int _code, const String& _err, const char* _func, const char* _file, int _line)
{
error(_code, _err, _func, _file, _line);
#ifdef __GNUC__
# if !defined __clang__ && !defined __APPLE__
// this suppresses this warning: "noreturn" function does return [enabled by default]
__builtin_trap();
// or use infinite loop: for (;;) {}
# endif
#endif
}
#ifdef __GNUC__
# if defined __clang__ || defined __APPLE__
# pragma GCC diagnostic pop
# endif
#endif
#if defined __GNUC__
#define CV_Func __func__
#elif defined _MSC_VER
#define CV_Func __FUNCTION__
#else
#define CV_Func ""
#endif
CV_EXPORTS CV_NORETURN void error(int _code, const String& _err, const char* _func, const char* _file, int _line);
#ifdef CV_STATIC_ANALYSIS
// In practice, some macros are not processed correctly (noreturn is not detected).
// We need to use simplified definition for them.
#define CV_Error(...) do { abort(); } while (0)
#define CV_Error_(...) do { abort(); } while (0)
#define CV_Assert(cond) do { if (!(cond)) abort(); } while (0)
#define CV_ErrorNoReturn(...) do { abort(); } while (0)
#define CV_ErrorNoReturn_(...) do { abort(); } while (0)
#define CV_Error_( code, args ) do { cv::format args; abort(); } while (0)
#define CV_Assert( expr ) do { if (!(expr)) abort(); } while (0)
#else // CV_STATIC_ANALYSIS
@@ -437,7 +325,7 @@ This macro can be used to construct an error message on-fly to include some dyna
for example:
@code
// note the extra parentheses around the formatted text message
CV_Error_( CV_StsOutOfRange,
CV_Error_(Error::StsOutOfRange,
("the value at (%d, %d)=%g is out of range", badPt.x, badPt.y, badValue));
@endcode
@param code one of Error::Code
@@ -451,35 +339,39 @@ The macros CV_Assert (and CV_DbgAssert(expr)) evaluate the specified expression.
raise an error (see cv::error). The macro CV_Assert checks the condition in both Debug and Release
configurations while CV_DbgAssert is only retained in the Debug configuration.
*/
#define CV_VA_NUM_ARGS_HELPER(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, N, ...) N
#define CV_VA_NUM_ARGS(...) CV_VA_NUM_ARGS_HELPER(__VA_ARGS__, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
#define CV_Assert_1( expr ) if(!!(expr)) ; else cv::error( cv::Error::StsAssert, #expr, CV_Func, __FILE__, __LINE__ )
#define CV_Assert_2( expr1, expr2 ) CV_Assert_1(expr1); CV_Assert_1(expr2)
#define CV_Assert_3( expr1, expr2, expr3 ) CV_Assert_2(expr1, expr2); CV_Assert_1(expr3)
#define CV_Assert_4( expr1, expr2, expr3, expr4 ) CV_Assert_3(expr1, expr2, expr3); CV_Assert_1(expr4)
#define CV_Assert_5( expr1, expr2, expr3, expr4, expr5 ) CV_Assert_4(expr1, expr2, expr3, expr4); CV_Assert_1(expr5)
#define CV_Assert_6( expr1, expr2, expr3, expr4, expr5, expr6 ) CV_Assert_5(expr1, expr2, expr3, expr4, expr5); CV_Assert_1(expr6)
#define CV_Assert_7( expr1, expr2, expr3, expr4, expr5, expr6, expr7 ) CV_Assert_6(expr1, expr2, expr3, expr4, expr5, expr6 ); CV_Assert_1(expr7)
#define CV_Assert_8( expr1, expr2, expr3, expr4, expr5, expr6, expr7, expr8 ) CV_Assert_7(expr1, expr2, expr3, expr4, expr5, expr6, expr7 ); CV_Assert_1(expr8)
#define CV_Assert_9( expr1, expr2, expr3, expr4, expr5, expr6, expr7, expr8, expr9 ) CV_Assert_8(expr1, expr2, expr3, expr4, expr5, expr6, expr7, expr8 ); CV_Assert_1(expr9)
#define CV_Assert_10( expr1, expr2, expr3, expr4, expr5, expr6, expr7, expr8, expr9, expr10 ) CV_Assert_9(expr1, expr2, expr3, expr4, expr5, expr6, expr7, expr8, expr9 ); CV_Assert_1(expr10)
#define CV_Assert(...) CVAUX_CONCAT(CV_Assert_, CV_VA_NUM_ARGS(__VA_ARGS__)) (__VA_ARGS__)
/** same as CV_Error(code,msg), but does not return */
#define CV_ErrorNoReturn( code, msg ) cv::errorNoReturn( code, msg, CV_Func, __FILE__, __LINE__ )
/** same as CV_Error_(code,args), but does not return */
#define CV_ErrorNoReturn_( code, args ) cv::errorNoReturn( code, cv::format args, CV_Func, __FILE__, __LINE__ )
#define CV_Assert( expr ) do { if(!!(expr)) ; else cv::error( cv::Error::StsAssert, #expr, CV_Func, __FILE__, __LINE__ ); } while(0)
#endif // CV_STATIC_ANALYSIS
/** replaced with CV_Assert(expr) in Debug configuration */
#ifdef _DEBUG
//! @cond IGNORED
#if !defined(__OPENCV_BUILD) // TODO: backward compatibility only
#ifndef CV_ErrorNoReturn
#define CV_ErrorNoReturn CV_Error
#endif
#ifndef CV_ErrorNoReturn_
#define CV_ErrorNoReturn_ CV_Error_
#endif
#endif
#define CV_Assert_1 CV_Assert
#define CV_Assert_2( expr, ... ) CV_Assert_1(expr); __CV_EXPAND(CV_Assert_1( __VA_ARGS__ ))
#define CV_Assert_3( expr, ... ) CV_Assert_1(expr); __CV_EXPAND(CV_Assert_2( __VA_ARGS__ ))
#define CV_Assert_4( expr, ... ) CV_Assert_1(expr); __CV_EXPAND(CV_Assert_3( __VA_ARGS__ ))
#define CV_Assert_5( expr, ... ) CV_Assert_1(expr); __CV_EXPAND(CV_Assert_4( __VA_ARGS__ ))
#define CV_Assert_6( expr, ... ) CV_Assert_1(expr); __CV_EXPAND(CV_Assert_5( __VA_ARGS__ ))
#define CV_Assert_7( expr, ... ) CV_Assert_1(expr); __CV_EXPAND(CV_Assert_6( __VA_ARGS__ ))
#define CV_Assert_8( expr, ... ) CV_Assert_1(expr); __CV_EXPAND(CV_Assert_7( __VA_ARGS__ ))
#define CV_Assert_9( expr, ... ) CV_Assert_1(expr); __CV_EXPAND(CV_Assert_8( __VA_ARGS__ ))
#define CV_Assert_10( expr, ... ) CV_Assert_1(expr); __CV_EXPAND(CV_Assert_9( __VA_ARGS__ ))
#define CV_Assert_N(...) do { __CV_EXPAND(__CV_CAT(CV_Assert_, __CV_VA_NUM_ARGS(__VA_ARGS__)) (__VA_ARGS__)); } while(0)
//! @endcond
#if defined _DEBUG || defined CV_STATIC_ANALYSIS
# define CV_DbgAssert(expr) CV_Assert(expr)
#else
/** replaced with CV_Assert(expr) in Debug configuration */
# define CV_DbgAssert(expr)
#endif
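/** A short sketch of the assertion macros (the matrix is illustrative):
@code
    cv::Mat m(3, 3, CV_32F);
    CV_Assert(m.depth() == CV_32F);        // checked in Debug and Release
    CV_Assert_N(m.rows > 0, m.cols > 0);   // multi-expression form
    CV_DbgAssert(m.isContinuous());        // compiled out in Release
@endcode
*/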
@@ -489,7 +381,7 @@ configurations while CV_DbgAssert is only retained in the Debug configuration.
*/
struct CV_EXPORTS Hamming
{
enum { normType = NORM_HAMMING };
static const NormTypes normType = NORM_HAMMING;
typedef unsigned char ValueType;
typedef int ResultType;
@@ -726,11 +618,7 @@ namespace cudev
namespace ipp
{
#if OPENCV_ABI_COMPATIBILITY > 300
CV_EXPORTS unsigned long long getIppFeatures();
#else
CV_EXPORTS int getIppFeatures();
#endif
CV_EXPORTS void setIppStatus(int status, const char * const funcname = NULL, const char * const filename = NULL,
int line = 0);
CV_EXPORTS int getIppStatus();
@@ -740,9 +628,13 @@ CV_EXPORTS_W void setUseIPP(bool flag);
CV_EXPORTS_W String getIppVersion();
// IPP Not-Exact mode. This function may force use of IPP when both IPP and OpenCV provide proper results
// but have internal accuracy differences which have to much direct or indirect impact on accuracy tests.
CV_EXPORTS_W bool useIPP_NE();
CV_EXPORTS_W void setUseIPP_NE(bool flag);
// but have internal accuracy differences which have too much direct or indirect impact on accuracy tests.
CV_EXPORTS_W bool useIPP_NotExact();
CV_EXPORTS_W void setUseIPP_NotExact(bool flag);
#ifndef DISABLE_OPENCV_3_COMPATIBILITY
static inline bool useIPP_NE() { return useIPP_NotExact(); }
static inline void setUseIPP_NE(bool flag) { setUseIPP_NotExact(flag); }
#endif
} // ipp
@@ -757,5 +649,6 @@ CV_EXPORTS_W void setUseIPP_NE(bool flag);
#include "opencv2/core/neon_utils.hpp"
#include "opencv2/core/vsx_utils.hpp"
#include "opencv2/core/check.hpp"
#endif //OPENCV_CORE_BASE_HPP

View File

@@ -0,0 +1,23 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_CORE_BINDINGS_UTILS_HPP
#define OPENCV_CORE_BINDINGS_UTILS_HPP
namespace cv { namespace utils {
//! @addtogroup core_utils
//! @{
CV_EXPORTS_W String dumpInputArray(InputArray argument);
CV_EXPORTS_W String dumpInputArrayOfArrays(InputArrayOfArrays argument);
CV_EXPORTS_W String dumpInputOutputArray(InputOutputArray argument);
CV_EXPORTS_W String dumpInputOutputArrayOfArrays(InputOutputArrayOfArrays argument);
//! @}
}} // namespace
#endif // OPENCV_CORE_BINDINGS_UTILS_HPP

View File

@@ -0,0 +1,159 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_CORE_CHECK_HPP
#define OPENCV_CORE_CHECK_HPP
#include <opencv2/core/base.hpp>
namespace cv {
/** Returns string of cv::Mat depth value: CV_8U -> "CV_8U" or "<invalid depth>" */
CV_EXPORTS const char* depthToString(int depth);
/** Returns string of cv::Mat depth value: CV_8UC3 -> "CV_8UC3" or "<invalid type>" */
CV_EXPORTS const String typeToString(int type);
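/** For example, a quick sketch of the two helpers:
@code
    std::cout << cv::typeToString(CV_8UC3) << std::endl;  // prints "CV_8UC3"
    std::cout << cv::depthToString(CV_32F) << std::endl;  // prints "CV_32F"
@endcode
*/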
//! @cond IGNORED
namespace detail {
/** Returns string of cv::Mat depth value: CV_8U -> "CV_8U" or NULL */
CV_EXPORTS const char* depthToString_(int depth);
/** Returns string of cv::Mat depth value: CV_8UC3 -> "CV_8UC3" or cv::String() */
CV_EXPORTS const cv::String typeToString_(int type);
enum TestOp {
TEST_CUSTOM = 0,
TEST_EQ = 1,
TEST_NE = 2,
TEST_LE = 3,
TEST_LT = 4,
TEST_GE = 5,
TEST_GT = 6,
CV__LAST_TEST_OP
};
struct CheckContext {
const char* func;
const char* file;
int line;
enum TestOp testOp;
const char* message;
const char* p1_str;
const char* p2_str;
};
#ifndef CV__CHECK_FILENAME
# define CV__CHECK_FILENAME __FILE__
#endif
#ifndef CV__CHECK_FUNCTION
# if defined _MSC_VER
# define CV__CHECK_FUNCTION __FUNCSIG__
# elif defined __GNUC__
# define CV__CHECK_FUNCTION __PRETTY_FUNCTION__
# else
# define CV__CHECK_FUNCTION "<unknown>"
# endif
#endif
#define CV__CHECK_LOCATION_VARNAME(id) CVAUX_CONCAT(CVAUX_CONCAT(__cv_check_, id), __LINE__)
#define CV__DEFINE_CHECK_CONTEXT(id, message, testOp, p1_str, p2_str) \
static const cv::detail::CheckContext CV__CHECK_LOCATION_VARNAME(id) = \
{ CV__CHECK_FUNCTION, CV__CHECK_FILENAME, __LINE__, testOp, message, p1_str, p2_str }
CV_EXPORTS void CV_NORETURN check_failed_auto(const int v1, const int v2, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_auto(const size_t v1, const size_t v2, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_auto(const float v1, const float v2, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_auto(const double v1, const double v2, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_auto(const Size_<int> v1, const Size_<int> v2, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_MatDepth(const int v1, const int v2, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_MatType(const int v1, const int v2, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_MatChannels(const int v1, const int v2, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_auto(const int v, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_auto(const size_t v, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_auto(const float v, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_auto(const double v, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_auto(const Size_<int> v, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_MatDepth(const int v, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_MatType(const int v, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_MatChannels(const int v, const CheckContext& ctx);
#define CV__TEST_EQ(v1, v2) ((v1) == (v2))
#define CV__TEST_NE(v1, v2) ((v1) != (v2))
#define CV__TEST_LE(v1, v2) ((v1) <= (v2))
#define CV__TEST_LT(v1, v2) ((v1) < (v2))
#define CV__TEST_GE(v1, v2) ((v1) >= (v2))
#define CV__TEST_GT(v1, v2) ((v1) > (v2))
#define CV__CHECK(id, op, type, v1, v2, v1_str, v2_str, msg_str) do { \
if(CV__TEST_##op((v1), (v2))) ; else { \
CV__DEFINE_CHECK_CONTEXT(id, msg_str, cv::detail::TEST_ ## op, v1_str, v2_str); \
cv::detail::check_failed_ ## type((v1), (v2), CV__CHECK_LOCATION_VARNAME(id)); \
} \
} while (0)
#define CV__CHECK_CUSTOM_TEST(id, type, v, test_expr, v_str, test_expr_str, msg_str) do { \
if(!!(test_expr)) ; else { \
CV__DEFINE_CHECK_CONTEXT(id, msg_str, cv::detail::TEST_CUSTOM, v_str, test_expr_str); \
cv::detail::check_failed_ ## type((v), CV__CHECK_LOCATION_VARNAME(id)); \
} \
} while (0)
} // namespace
//! @endcond
/// Supported values of these types: int, float, double
#define CV_CheckEQ(v1, v2, msg) CV__CHECK(_, EQ, auto, v1, v2, #v1, #v2, msg)
#define CV_CheckNE(v1, v2, msg) CV__CHECK(_, NE, auto, v1, v2, #v1, #v2, msg)
#define CV_CheckLE(v1, v2, msg) CV__CHECK(_, LE, auto, v1, v2, #v1, #v2, msg)
#define CV_CheckLT(v1, v2, msg) CV__CHECK(_, LT, auto, v1, v2, #v1, #v2, msg)
#define CV_CheckGE(v1, v2, msg) CV__CHECK(_, GE, auto, v1, v2, #v1, #v2, msg)
#define CV_CheckGT(v1, v2, msg) CV__CHECK(_, GT, auto, v1, v2, #v1, #v2, msg)
/// Check with additional "decoding" of type values in error message
#define CV_CheckTypeEQ(t1, t2, msg) CV__CHECK(_, EQ, MatType, t1, t2, #t1, #t2, msg)
/// Check with additional "decoding" of depth values in error message
#define CV_CheckDepthEQ(d1, d2, msg) CV__CHECK(_, EQ, MatDepth, d1, d2, #d1, #d2, msg)
#define CV_CheckChannelsEQ(c1, c2, msg) CV__CHECK(_, EQ, MatChannels, c1, c2, #c1, #c2, msg)
/// Example: type == CV_8UC1 || type == CV_8UC3
#define CV_CheckType(t, test_expr, msg) CV__CHECK_CUSTOM_TEST(_, MatType, t, (test_expr), #t, #test_expr, msg)
/// Example: depth == CV_32F || depth == CV_64F
#define CV_CheckDepth(t, test_expr, msg) CV__CHECK_CUSTOM_TEST(_, MatDepth, t, (test_expr), #t, #test_expr, msg)
/// Example: v == A || v == B
#define CV_Check(v, test_expr, msg) CV__CHECK_CUSTOM_TEST(_, auto, v, (test_expr), #v, #test_expr, msg)
/// Some complex conditions: CV_Check(src2, src2.empty() || (src2.type() == src1.type() && src2.size() == src1.size()), "src2 should have same size/type as src1")
// TODO define pretty-printers
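/** A brief sketch of the check macros above (the type value is illustrative):
@code
    int type = CV_8UC1;
    CV_CheckType(type, type == CV_8UC1 || type == CV_8UC3,
                 "Only 1- and 3-channel 8-bit input is supported");
    CV_CheckEQ(CV_MAT_CN(type), 1, "Expected a single-channel matrix");
@endcode
*/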
#ifndef NDEBUG
#define CV_DbgCheck(v, test_expr, msg) CV__CHECK_CUSTOM_TEST(_, auto, v, (test_expr), #v, #test_expr, msg)
#define CV_DbgCheckEQ(v1, v2, msg) CV__CHECK(_, EQ, auto, v1, v2, #v1, #v2, msg)
#define CV_DbgCheckNE(v1, v2, msg) CV__CHECK(_, NE, auto, v1, v2, #v1, #v2, msg)
#define CV_DbgCheckLE(v1, v2, msg) CV__CHECK(_, LE, auto, v1, v2, #v1, #v2, msg)
#define CV_DbgCheckLT(v1, v2, msg) CV__CHECK(_, LT, auto, v1, v2, #v1, #v2, msg)
#define CV_DbgCheckGE(v1, v2, msg) CV__CHECK(_, GE, auto, v1, v2, #v1, #v2, msg)
#define CV_DbgCheckGT(v1, v2, msg) CV__CHECK(_, GT, auto, v1, v2, #v1, #v2, msg)
#else
#define CV_DbgCheck(v, test_expr, msg) do { } while (0)
#define CV_DbgCheckEQ(v1, v2, msg) do { } while (0)
#define CV_DbgCheckNE(v1, v2, msg) do { } while (0)
#define CV_DbgCheckLE(v1, v2, msg) do { } while (0)
#define CV_DbgCheckLT(v1, v2, msg) do { } while (0)
#define CV_DbgCheckGE(v1, v2, msg) do { } while (0)
#define CV_DbgCheckGT(v1, v2, msg) do { } while (0)
#endif
} // namespace
#endif // OPENCV_CORE_CHECK_HPP

View File

@@ -1576,8 +1576,8 @@ CVAPI(void) cvRestoreMemStoragePos( CvMemStorage* storage, CvMemStoragePos* pos
CVAPI(void*) cvMemStorageAlloc( CvMemStorage* storage, size_t size );
/** Allocates string in memory storage */
CVAPI(CvString) cvMemStorageAllocString( CvMemStorage* storage, const char* ptr,
int len CV_DEFAULT(-1) );
//CVAPI(CvString) cvMemStorageAllocString( CvMemStorage* storage, const char* ptr,
// int len CV_DEFAULT(-1) );
/** Creates new empty sequence that will reside in the specified storage */
CVAPI(CvSeq*) cvCreateSeq( int seq_flags, size_t header_size,
@@ -1788,7 +1788,7 @@ CVAPI(int) cvGraphRemoveVtx( CvGraph* graph, int index );
CVAPI(int) cvGraphRemoveVtxByPtr( CvGraph* graph, CvGraphVtx* vtx );
/** Link two vertices specifed by indices or pointers if they
/** Link two vertices specified by indices or pointers if they
are not connected or return pointer to already existing edge
connecting the vertices.
Functions return 1 if a new edge was created, 0 otherwise */
@@ -1970,6 +1970,7 @@ CVAPI(void) cvSetIPLAllocators( Cv_iplCreateImageHeader create_header,
* Data Persistence *
\****************************************************************************************/
#if 0
/********************************** High-level functions ********************************/
/** @brief Opens file storage for reading or writing data.
@@ -1978,11 +1979,7 @@ The function opens file storage for reading or writing data. In the latter case,
a new file is created or an existing file is rewritten. The type of the read or written file is determined by the
filename extension: .xml for XML, .yml or .yaml for YAML and .json for JSON.
At the same time, it also supports adding parameters like "example.xml?base64". The three ways
are the same:
@snippet samples/cpp/filestorage_base64.cpp suffix_in_file_name
@snippet samples/cpp/filestorage_base64.cpp flag_write_base64
@snippet samples/cpp/filestorage_base64.cpp flag_write_and_flag_base64
At the same time, it also supports adding parameters like "example.xml?base64".
The function returns a pointer to the CvFileStorage structure.
If the file cannot be opened then the function returns NULL.
@@ -2206,11 +2203,6 @@ in plain text.
This function can only be used to write a sequence with a type "binary".
Consider the following two examples where their output is the same:
@snippet samples/cpp/filestorage_base64.cpp without_base64_flag
and
@snippet samples/cpp/filestorage_base64.cpp with_write_base64_flag
@param fs File storage
@param src Pointer to the written array
@param len Number of the array elements to write
@@ -2565,10 +2557,12 @@ returns NULL.
*/
CVAPI(CvTypeInfo*) cvTypeOf( const void* struct_ptr );
#endif
/** @brief Releases an object.
The function finds the type of a given object and calls release with the double pointer.
@param struct_ptr Double pointer to the object
The function finds the type of a given object and calls release with the double pointer.
@param struct_ptr Double pointer to the object
*/
CVAPI(void) cvRelease( void** struct_ptr );
@@ -2581,41 +2575,6 @@ function, like cvCloneMat.
*/
CVAPI(void*) cvClone( const void* struct_ptr );
/** @brief Saves an object to a file.
The function saves an object to a file. It provides a simple interface to cvWrite .
@param filename File name
@param struct_ptr Object to save
@param name Optional object name. If it is NULL, the name will be formed from filename .
@param comment Optional comment to put in the beginning of the file
@param attributes Optional attributes passed to cvWrite
*/
CVAPI(void) cvSave( const char* filename, const void* struct_ptr,
const char* name CV_DEFAULT(NULL),
const char* comment CV_DEFAULT(NULL),
CvAttrList attributes CV_DEFAULT(cvAttrList()));
/** @brief Loads an object from a file.
The function loads an object from a file. It basically reads the specified file, find the first
top-level node and calls cvRead for that node. If the file node does not have type information or
the type information can not be found by the type name, the function returns NULL. After the object
is loaded, the file storage is closed and all the temporary buffers are deleted. Thus, to load a
dynamic structure, such as a sequence, contour, or graph, one should pass a valid memory storage
destination to the function.
@param filename File name
@param memstorage Memory storage for dynamic structures, such as CvSeq or CvGraph . It is not used
for matrices or images.
@param name Optional object name. If it is NULL, the first top-level object in the storage will be
loaded.
@param real_name Optional output parameter that will contain the name of the loaded object
(useful if name=NULL )
*/
CVAPI(void*) cvLoad( const char* filename,
CvMemStorage* memstorage CV_DEFAULT(NULL),
const char* name CV_DEFAULT(NULL),
const char** real_name CV_DEFAULT(NULL) );
/*********************************** Measuring Execution Time ***************************/
/** helper functions for RNG initialization and accurate time measurement:
@@ -2648,7 +2607,7 @@ CVAPI(void) cvSetErrStatus( int status );
#define CV_ErrModeParent 1 /* Print error and continue */
#define CV_ErrModeSilent 2 /* Don't print and continue */
/** Retrives current error processing mode */
/** Retrieves current error processing mode */
CVAPI(int) cvGetErrMode( void );
/** Sets error processing mode, returns previously used mode */
@@ -2738,7 +2697,7 @@ static char cvFuncName[] = Name
/**
CV_CALL macro calls CV (or IPL) function, checks error status and
signals an error if the function failed. Useful in "parent node"
error procesing mode
error processing mode
*/
#define CV_CALL( Func ) \
{ \
@@ -2766,24 +2725,6 @@ static char cvFuncName[] = Name
#ifdef __cplusplus
//! @addtogroup core_c_glue
//! @{
//! class for automatic module/RTTI data registration/unregistration
struct CV_EXPORTS CvType
{
CvType( const char* type_name,
CvIsInstanceFunc is_instance, CvReleaseFunc release=0,
CvReadFunc read=0, CvWriteFunc write=0, CvCloneFunc clone=0 );
~CvType();
CvTypeInfo* info;
static CvTypeInfo* first;
static CvTypeInfo* last;
};
//! @}
#include "opencv2/core/utility.hpp"
namespace cv
@@ -2814,11 +2755,11 @@ CV_EXPORTS void insertImageCOI(InputArray coiimg, CvArr* arr, int coi=-1);
////// specialized implementations of DefaultDeleter::operator() for classic OpenCV types //////
template<> CV_EXPORTS void DefaultDeleter<CvMat>::operator ()(CvMat* obj) const;
template<> CV_EXPORTS void DefaultDeleter<IplImage>::operator ()(IplImage* obj) const;
template<> CV_EXPORTS void DefaultDeleter<CvMatND>::operator ()(CvMatND* obj) const;
template<> CV_EXPORTS void DefaultDeleter<CvSparseMat>::operator ()(CvSparseMat* obj) const;
template<> CV_EXPORTS void DefaultDeleter<CvMemStorage>::operator ()(CvMemStorage* obj) const;
template<> struct DefaultDeleter<CvMat>{ CV_EXPORTS void operator ()(CvMat* obj) const; };
template<> struct DefaultDeleter<IplImage>{ CV_EXPORTS void operator ()(IplImage* obj) const; };
template<> struct DefaultDeleter<CvMatND>{ CV_EXPORTS void operator ()(CvMatND* obj) const; };
template<> struct DefaultDeleter<CvSparseMat>{ CV_EXPORTS void operator ()(CvSparseMat* obj) const; };
template<> struct DefaultDeleter<CvMemStorage>{ CV_EXPORTS void operator ()(CvMemStorage* obj) const; };
////////////// convenient wrappers for operating old-style dynamic structures //////////////
@@ -3073,7 +3014,7 @@ template<typename _Tp> inline void Seq<_Tp>::copyTo(std::vector<_Tp>& vec, const
size_t len = !seq ? 0 : range == Range::all() ? seq->total : range.end - range.start;
vec.resize(len);
if( seq && len )
cvCvtSeqToArray(seq, &vec[0], range);
cvCvtSeqToArray(seq, &vec[0], cvSlice(range));
}
template<typename _Tp> inline Seq<_Tp>::operator std::vector<_Tp>() const

View File

@@ -56,7 +56,7 @@
@{
@defgroup cudacore Core part
@{
@defgroup cudacore_init Initalization and Information
@defgroup cudacore_init Initialization and Information
@defgroup cudacore_struct Data Structures
@}
@}
@@ -91,12 +91,21 @@ aligned to a size depending on the hardware. Single-row GpuMat is always a conti
on its destructor. The destruction order of such variables and CUDA context is undefined. GPU memory
release function returns error if the CUDA context has been destroyed before.
Some member functions are described as a "Blocking Call" while some are described as a
"Non-Blocking Call". Blocking functions are synchronous to host. It is guaranteed that the GPU
operation is finished when the function returns. However, non-blocking functions are asynchronous to
host. Those functions may return even if the GPU operation is not finished.
Compared to their blocking counterpart, non-blocking functions accept Stream as an additional
argument. If a non-default stream is passed, the GPU operation may overlap with operations in other
streams.
@sa Mat
*/
class CV_EXPORTS GpuMat
class CV_EXPORTS_W GpuMat
{
public:
class CV_EXPORTS Allocator
class CV_EXPORTS_W Allocator
{
public:
virtual ~Allocator() {}
@@ -107,33 +116,33 @@ public:
};
//! default allocator
static Allocator* defaultAllocator();
static void setDefaultAllocator(Allocator* allocator);
CV_WRAP static GpuMat::Allocator* defaultAllocator();
CV_WRAP static void setDefaultAllocator(GpuMat::Allocator* allocator);
//! default constructor
explicit GpuMat(Allocator* allocator = defaultAllocator());
CV_WRAP explicit GpuMat(GpuMat::Allocator* allocator = GpuMat::defaultAllocator());
//! constructs GpuMat of the specified size and type
GpuMat(int rows, int cols, int type, Allocator* allocator = defaultAllocator());
GpuMat(Size size, int type, Allocator* allocator = defaultAllocator());
CV_WRAP GpuMat(int rows, int cols, int type, GpuMat::Allocator* allocator = GpuMat::defaultAllocator());
CV_WRAP GpuMat(Size size, int type, GpuMat::Allocator* allocator = GpuMat::defaultAllocator());
//! constructs GpuMat and fills it with the specified value _s
GpuMat(int rows, int cols, int type, Scalar s, Allocator* allocator = defaultAllocator());
GpuMat(Size size, int type, Scalar s, Allocator* allocator = defaultAllocator());
CV_WRAP GpuMat(int rows, int cols, int type, Scalar s, GpuMat::Allocator* allocator = GpuMat::defaultAllocator());
CV_WRAP GpuMat(Size size, int type, Scalar s, GpuMat::Allocator* allocator = GpuMat::defaultAllocator());
//! copy constructor
GpuMat(const GpuMat& m);
CV_WRAP GpuMat(const GpuMat& m);
//! constructor for GpuMat headers pointing to user-allocated data
GpuMat(int rows, int cols, int type, void* data, size_t step = Mat::AUTO_STEP);
GpuMat(Size size, int type, void* data, size_t step = Mat::AUTO_STEP);
//! creates a GpuMat header for a part of the bigger matrix
GpuMat(const GpuMat& m, Range rowRange, Range colRange);
GpuMat(const GpuMat& m, Rect roi);
CV_WRAP GpuMat(const GpuMat& m, Range rowRange, Range colRange);
CV_WRAP GpuMat(const GpuMat& m, Rect roi);
//! builds GpuMat from host memory (Blocking call)
explicit GpuMat(InputArray arr, Allocator* allocator = defaultAllocator());
CV_WRAP explicit GpuMat(InputArray arr, GpuMat::Allocator* allocator = GpuMat::defaultAllocator());
//! destructor - calls release()
~GpuMat();
@@ -142,70 +151,92 @@ public:
GpuMat& operator =(const GpuMat& m);
//! allocates new GpuMat data unless the GpuMat already has specified size and type
void create(int rows, int cols, int type);
void create(Size size, int type);
CV_WRAP void create(int rows, int cols, int type);
CV_WRAP void create(Size size, int type);
//! decreases reference counter, deallocate the data when reference counter reaches 0
void release();
//! swaps with other smart pointer
void swap(GpuMat& mat);
CV_WRAP void swap(GpuMat& mat);
//! pefroms upload data to GpuMat (Blocking call)
void upload(InputArray arr);
//! pefroms upload data to GpuMat (Non-Blocking call)
void upload(InputArray arr, Stream& stream);
//! pefroms download data from device to host memory (Blocking call)
void download(OutputArray dst) const;
//! pefroms download data from device to host memory (Non-Blocking call)
void download(OutputArray dst, Stream& stream) const;
/** @brief Performs data upload to GpuMat (Blocking call)
This function copies data from host memory to device memory. As being a blocking call, it is
guaranteed that the copy operation is finished when this function returns.
*/
CV_WRAP void upload(InputArray arr);
/** @brief Performs data upload to GpuMat (Non-Blocking call)
This function copies data from host memory to device memory. As being a non-blocking call, this
function may return even if the copy operation is not finished.
The copy operation may be overlapped with operations in other non-default streams if \p stream is
not the default stream and \p dst is HostMem allocated with HostMem::PAGE_LOCKED option.
*/
CV_WRAP void upload(InputArray arr, Stream& stream);
/** @brief Performs data download from GpuMat (Blocking call)
This function copies data from device memory to host memory. As being a blocking call, it is
guaranteed that the copy operation is finished when this function returns.
*/
CV_WRAP void download(OutputArray dst) const;
/** @brief Performs data download from GpuMat (Non-Blocking call)
This function copies data from device memory to host memory. As being a non-blocking call, this
function may return even if the copy operation is not finished.
The copy operation may be overlapped with operations in other non-default streams if \p stream is
not the default stream and \p dst is HostMem allocated with HostMem::PAGE_LOCKED option.
*/
CV_WRAP void download(OutputArray dst, Stream& stream) const;
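A minimal sketch tying the blocking and non-blocking variants together (assumed usage; for real overlap the host buffers should be page-locked HostMem, as the comments above state):

#include <opencv2/core/cuda.hpp>

void roundTrip(const cv::Mat& host_src, cv::Mat& host_dst)
{
    cv::cuda::Stream stream;            // non-default stream
    cv::cuda::GpuMat d_img;
    d_img.upload(host_src, stream);     // non-blocking: may return before the copy ends
    d_img.download(host_dst, stream);   // non-blocking download into host_dst
    stream.waitForCompletion();         // host_dst is only safe to read after this
}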
//! returns deep copy of the GpuMat, i.e. the data is copied
GpuMat clone() const;
CV_WRAP GpuMat clone() const;
//! copies the GpuMat content to device memory (Blocking call)
void copyTo(OutputArray dst) const;
CV_WRAP void copyTo(OutputArray dst) const;
//! copies the GpuMat content to device memory (Non-Blocking call)
void copyTo(OutputArray dst, Stream& stream) const;
CV_WRAP void copyTo(OutputArray dst, Stream& stream) const;
//! copies those GpuMat elements to "m" that are marked with non-zero mask elements (Blocking call)
void copyTo(OutputArray dst, InputArray mask) const;
CV_WRAP void copyTo(OutputArray dst, InputArray mask) const;
//! copies those GpuMat elements to "m" that are marked with non-zero mask elements (Non-Blocking call)
void copyTo(OutputArray dst, InputArray mask, Stream& stream) const;
CV_WRAP void copyTo(OutputArray dst, InputArray mask, Stream& stream) const;
//! sets some of the GpuMat elements to s (Blocking call)
GpuMat& setTo(Scalar s);
CV_WRAP GpuMat& setTo(Scalar s);
//! sets some of the GpuMat elements to s (Non-Blocking call)
GpuMat& setTo(Scalar s, Stream& stream);
CV_WRAP GpuMat& setTo(Scalar s, Stream& stream);
//! sets some of the GpuMat elements to s, according to the mask (Blocking call)
GpuMat& setTo(Scalar s, InputArray mask);
CV_WRAP GpuMat& setTo(Scalar s, InputArray mask);
//! sets some of the GpuMat elements to s, according to the mask (Non-Blocking call)
GpuMat& setTo(Scalar s, InputArray mask, Stream& stream);
CV_WRAP GpuMat& setTo(Scalar s, InputArray mask, Stream& stream);
//! converts GpuMat to another datatype (Blocking call)
void convertTo(OutputArray dst, int rtype) const;
CV_WRAP void convertTo(OutputArray dst, int rtype) const;
//! converts GpuMat to another datatype (Non-Blocking call)
void convertTo(OutputArray dst, int rtype, Stream& stream) const;
CV_WRAP void convertTo(OutputArray dst, int rtype, Stream& stream) const;
//! converts GpuMat to another datatype with scaling (Blocking call)
void convertTo(OutputArray dst, int rtype, double alpha, double beta = 0.0) const;
CV_WRAP void convertTo(OutputArray dst, int rtype, double alpha, double beta = 0.0) const;
//! converts GpuMat to another datatype with scaling (Non-Blocking call)
void convertTo(OutputArray dst, int rtype, double alpha, Stream& stream) const;
CV_WRAP void convertTo(OutputArray dst, int rtype, double alpha, Stream& stream) const;
//! converts GpuMat to another datatype with scaling (Non-Blocking call)
void convertTo(OutputArray dst, int rtype, double alpha, double beta, Stream& stream) const;
CV_WRAP void convertTo(OutputArray dst, int rtype, double alpha, double beta, Stream& stream) const;
void assignTo(GpuMat& m, int type=-1) const;
CV_WRAP void assignTo(GpuMat& m, int type = -1) const;
//! returns pointer to y-th row
uchar* ptr(int y = 0);
@@ -219,18 +250,18 @@ public:
template <typename _Tp> operator PtrStep<_Tp>() const;
//! returns a new GpuMat header for the specified row
GpuMat row(int y) const;
CV_WRAP GpuMat row(int y) const;
//! returns a new GpuMat header for the specified column
GpuMat col(int x) const;
CV_WRAP GpuMat col(int x) const;
//! ... for the specified row span
GpuMat rowRange(int startrow, int endrow) const;
GpuMat rowRange(Range r) const;
CV_WRAP GpuMat rowRange(int startrow, int endrow) const;
CV_WRAP GpuMat rowRange(Range r) const;
//! ... for the specified column span
GpuMat colRange(int startcol, int endcol) const;
GpuMat colRange(Range r) const;
CV_WRAP GpuMat colRange(int startcol, int endcol) const;
CV_WRAP GpuMat colRange(Range r) const;
//! extracts a rectangular sub-GpuMat (this is a generalized form of row, rowRange etc.)
GpuMat operator ()(Range rowRange, Range colRange) const;
@@ -238,41 +269,44 @@ public:
//! creates alternative GpuMat header for the same data, with different
//! number of channels and/or different number of rows
GpuMat reshape(int cn, int rows = 0) const;
CV_WRAP GpuMat reshape(int cn, int rows = 0) const;
//! locates GpuMat header within a parent GpuMat
void locateROI(Size& wholeSize, Point& ofs) const;
CV_WRAP void locateROI(Size& wholeSize, Point& ofs) const;
//! moves/resizes the current GpuMat ROI inside the parent GpuMat
GpuMat& adjustROI(int dtop, int dbottom, int dleft, int dright);
CV_WRAP GpuMat& adjustROI(int dtop, int dbottom, int dleft, int dright);
//! returns true iff the GpuMat data is continuous
//! (i.e. when there are no gaps between successive rows)
bool isContinuous() const;
CV_WRAP bool isContinuous() const;
//! returns element size in bytes
size_t elemSize() const;
CV_WRAP size_t elemSize() const;
//! returns the size of element channel in bytes
size_t elemSize1() const;
CV_WRAP size_t elemSize1() const;
//! returns element type
int type() const;
CV_WRAP int type() const;
//! returns element type
int depth() const;
CV_WRAP int depth() const;
//! returns number of channels
int channels() const;
CV_WRAP int channels() const;
//! returns step/elemSize1()
size_t step1() const;
CV_WRAP size_t step1() const;
//! returns GpuMat size : width == number of columns, height == number of rows
Size size() const;
CV_WRAP Size size() const;
//! returns true if GpuMat data is NULL
bool empty() const;
CV_WRAP bool empty() const;
//! internal use method: updates the continuity flag
CV_WRAP void updateContinuityFlag();
/*! includes several bit-fields:
- the magic signature
@@ -286,7 +320,7 @@ public:
int rows, cols;
//! a distance between successive rows in bytes; includes the gap if any
size_t step;
CV_PROP size_t step;
//! pointer to the data
uchar* data;
@@ -314,7 +348,7 @@ public:
Matrix is called continuous if its elements are stored continuously, that is, without gaps at the
end of each row.
*/
CV_EXPORTS void createContinuous(int rows, int cols, int type, OutputArray arr);
CV_EXPORTS_W void createContinuous(int rows, int cols, int type, OutputArray arr);
/** @brief Ensures that the size of a matrix is big enough and the matrix has a proper type.
@@ -325,17 +359,126 @@ CV_EXPORTS void createContinuous(int rows, int cols, int type, OutputArray arr);
The function does not reallocate memory if the matrix has proper attributes already.
*/
CV_EXPORTS void ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr);
CV_EXPORTS_W void ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr);
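A minimal sketch of the two helpers (assumed usage):

cv::cuda::GpuMat buf;
cv::cuda::createContinuous(480, 640, CV_32FC1, buf);   // no gaps between rows
CV_Assert(buf.isContinuous());
// later reuse: no reallocation happens if size and type already match
cv::cuda::ensureSizeIsEnough(480, 640, CV_32FC1, buf);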
/** @brief BufferPool for use with CUDA streams
* BufferPool utilizes cuda::Stream's allocator to create new buffers. It is
* particularly useful when BufferPoolUsage is set to true, or a custom
* allocator is specified for the cuda::Stream, and you want to implement your
* own stream based functions utilizing the same underlying GPU memory
* management.
BufferPool utilizes Stream's allocator to create new buffers for GpuMat's. It is
only useful when enabled with #setBufferPoolUsage.
@code
setBufferPoolUsage(true);
@endcode
@note #setBufferPoolUsage must be called \em before any Stream declaration.
Users may specify custom allocator for Stream and may implement their own stream based
functions utilizing the same underlying GPU memory management.
If custom allocator is not specified, BufferPool utilizes StackAllocator by
default. StackAllocator allocates a chunk of GPU device memory beforehand,
and when GpuMat is declared later on, it is given the pre-allocated memory.
This kind of strategy reduces the number of calls for memory allocating APIs
such as cudaMalloc or cudaMallocPitch.
Below is an example that utilizes BufferPool with StackAllocator:
@code
#include <opencv2/opencv.hpp>
using namespace cv;
using namespace cv::cuda;
int main()
{
setBufferPoolUsage(true); // Tell OpenCV that we are going to utilize BufferPool
setBufferPoolConfig(getDevice(), 1024 * 1024 * 64, 2); // Allocate 64 MB, 2 stacks (default is 10 MB, 5 stacks)
Stream stream1, stream2; // Each stream uses 1 stack
BufferPool pool1(stream1), pool2(stream2);
GpuMat d_src1 = pool1.getBuffer(4096, 4096, CV_8UC1); // 16MB
GpuMat d_dst1 = pool1.getBuffer(4096, 4096, CV_8UC3); // 48MB, pool1 is now full
GpuMat d_src2 = pool2.getBuffer(1024, 1024, CV_8UC1); // 1MB
GpuMat d_dst2 = pool2.getBuffer(1024, 1024, CV_8UC3); // 3MB
cvtColor(d_src1, d_dst1, CV_GRAY2BGR, 0, stream1);
cvtColor(d_src2, d_dst2, CV_GRAY2BGR, 0, stream2);
}
@endcode
If we allocate another GpuMat on pool1 in the above example, it will be carried out by
the DefaultAllocator since the stack for pool1 is full.
@code
GpuMat d_add1 = pool1.getBuffer(1024, 1024, CV_8UC1); // Stack for pool1 is full, memory is allocated with DefaultAllocator
@endcode
If a third stream is declared in the above example, allocating with #getBuffer
within that stream will also be carried out by the DefaultAllocator because we've run out of
stacks.
@code
Stream stream3; // Only 2 stacks were allocated, we've run out of stacks
BufferPool pool3(stream3);
GpuMat d_src3 = pool3.getBuffer(1024, 1024, CV_8UC1); // Memory is allocated with DefaultAllocator
@endcode
@warning When utilizing StackAllocator, deallocation order is important.
Just like a stack, deallocation must be done in LIFO order. Below is an example of
erroneous usage that violates the LIFO rule. If OpenCV is compiled in Debug mode, this
sample code will emit a CV_Assert error.
@code
int main()
{
setBufferPoolUsage(true); // Tell OpenCV that we are going to utilize BufferPool
Stream stream; // A default size (10 MB) stack is allocated to this stream
BufferPool pool(stream);
GpuMat mat1 = pool.getBuffer(1024, 1024, CV_8UC1); // Allocate mat1 (1MB)
GpuMat mat2 = pool.getBuffer(1024, 1024, CV_8UC1); // Allocate mat2 (1MB)
mat1.release(); // erroneous usage : mat2 must be deallocated before mat1
}
@endcode
Since C++ local variables are destroyed in the reverse order of construction,
the code sample below satisfies the LIFO rule. Local GpuMat's are deallocated
and the corresponding memory is automatically returned to the pool for later usage.
@code
int main()
{
setBufferPoolUsage(true); // Tell OpenCV that we are going to utilize BufferPool
setBufferPoolConfig(getDevice(), 1024 * 1024 * 64, 2); // Allocate 64 MB, 2 stacks (default is 10 MB, 5 stacks)
Stream stream1, stream2; // Each stream uses 1 stack
BufferPool pool1(stream1), pool2(stream2);
for (int i = 0; i < 10; i++)
{
GpuMat d_src1 = pool1.getBuffer(4096, 4096, CV_8UC1); // 16MB
GpuMat d_dst1 = pool1.getBuffer(4096, 4096, CV_8UC3); // 48MB, pool1 is now full
GpuMat d_src2 = pool2.getBuffer(1024, 1024, CV_8UC1); // 1MB
GpuMat d_dst2 = pool2.getBuffer(1024, 1024, CV_8UC3); // 3MB
d_src1.setTo(Scalar(i), stream1);
d_src2.setTo(Scalar(i), stream2);
cvtColor(d_src1, d_dst1, CV_GRAY2BGR, 0, stream1);
cvtColor(d_src2, d_dst2, CV_GRAY2BGR, 0, stream2);
// The order of destruction of the local variables is:
// d_dst2 => d_src2 => d_dst1 => d_src1
// LIFO rule is satisfied, this code runs without error
}
}
@endcode
*/
class CV_EXPORTS BufferPool
class CV_EXPORTS_W BufferPool
{
public:
@@ -343,21 +486,21 @@ public:
explicit BufferPool(Stream& stream);
//! Allocates a new GpuMat of given size and type.
GpuMat getBuffer(int rows, int cols, int type);
CV_WRAP GpuMat getBuffer(int rows, int cols, int type);
//! Allocates a new GpuMat of given size and type.
GpuMat getBuffer(Size size, int type) { return getBuffer(size.height, size.width, type); }
CV_WRAP GpuMat getBuffer(Size size, int type) { return getBuffer(size.height, size.width, type); }
//! Returns the allocator associated with the stream.
Ptr<GpuMat::Allocator> getAllocator() const { return allocator_; }
CV_WRAP Ptr<GpuMat::Allocator> getAllocator() const { return allocator_; }
private:
Ptr<GpuMat::Allocator> allocator_;
};
//! BufferPool management (must be called before Stream creation)
CV_EXPORTS void setBufferPoolUsage(bool on);
CV_EXPORTS void setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount);
CV_EXPORTS_W void setBufferPoolUsage(bool on);
CV_EXPORTS_W void setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount);
//===================================================================================
// HostMem
@@ -378,46 +521,46 @@ Its interface is also Mat-like but with additional memory type parameters.
@note Allocation size of such memory types is usually limited. For more details, see *CUDA 2.2
Pinned Memory APIs* document or *CUDA C Programming Guide*.
*/
class CV_EXPORTS HostMem
class CV_EXPORTS_W HostMem
{
public:
enum AllocType { PAGE_LOCKED = 1, SHARED = 2, WRITE_COMBINED = 4 };
static MatAllocator* getAllocator(AllocType alloc_type = PAGE_LOCKED);
static MatAllocator* getAllocator(HostMem::AllocType alloc_type = HostMem::AllocType::PAGE_LOCKED);
explicit HostMem(AllocType alloc_type = PAGE_LOCKED);
CV_WRAP explicit HostMem(HostMem::AllocType alloc_type = HostMem::AllocType::PAGE_LOCKED);
HostMem(const HostMem& m);
HostMem(int rows, int cols, int type, AllocType alloc_type = PAGE_LOCKED);
HostMem(Size size, int type, AllocType alloc_type = PAGE_LOCKED);
CV_WRAP HostMem(int rows, int cols, int type, HostMem::AllocType alloc_type = HostMem::AllocType::PAGE_LOCKED);
CV_WRAP HostMem(Size size, int type, HostMem::AllocType alloc_type = HostMem::AllocType::PAGE_LOCKED);
//! creates from host memory, copying the data
explicit HostMem(InputArray arr, AllocType alloc_type = PAGE_LOCKED);
CV_WRAP explicit HostMem(InputArray arr, HostMem::AllocType alloc_type = HostMem::AllocType::PAGE_LOCKED);
~HostMem();
HostMem& operator =(const HostMem& m);
//! swaps with other smart pointer
void swap(HostMem& b);
CV_WRAP void swap(HostMem& b);
//! returns deep copy of the matrix, i.e. the data is copied
HostMem clone() const;
CV_WRAP HostMem clone() const;
//! allocates new matrix data unless the matrix already has specified size and type.
void create(int rows, int cols, int type);
CV_WRAP void create(int rows, int cols, int type);
void create(Size size, int type);
//! creates alternative HostMem header for the same data, with different
//! number of channels and/or different number of rows
HostMem reshape(int cn, int rows = 0) const;
CV_WRAP HostMem reshape(int cn, int rows = 0) const;
//! decrements reference counter and released memory if needed.
void release();
//! returns matrix header with disabled reference counting for HostMem data.
Mat createMatHeader() const;
CV_WRAP Mat createMatHeader() const;
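A minimal sketch of the intended workflow (assumed usage): allocate page-locked host memory, view it as a cv::Mat without copying, and feed it to an asynchronous upload.

cv::cuda::HostMem pinned(480, 640, CV_8UC1, cv::cuda::HostMem::PAGE_LOCKED);
cv::Mat header = pinned.createMatHeader();   // no copy, reference counting disabled
// ... fill `header` on the CPU ...
cv::cuda::Stream stream;
cv::cuda::GpuMat d_img;
d_img.upload(header, stream);                // copy may overlap work in other streams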
/** @brief Maps CPU memory to GPU address space and creates the cuda::GpuMat header without reference counting
for it.
@@ -429,20 +572,20 @@ public:
GpuMat createGpuMatHeader() const;
// Please see cv::Mat for descriptions
bool isContinuous() const;
size_t elemSize() const;
size_t elemSize1() const;
int type() const;
int depth() const;
int channels() const;
size_t step1() const;
Size size() const;
bool empty() const;
CV_WRAP bool isContinuous() const;
CV_WRAP size_t elemSize() const;
CV_WRAP size_t elemSize1() const;
CV_WRAP int type() const;
CV_WRAP int depth() const;
CV_WRAP int channels() const;
CV_WRAP size_t step1() const;
CV_WRAP Size size() const;
CV_WRAP bool empty() const;
// Please see cv::Mat for descriptions
int flags;
int rows, cols;
size_t step;
CV_PROP size_t step;
uchar* data;
int* refcount;
@@ -457,13 +600,13 @@ public:
@param m Input matrix.
*/
CV_EXPORTS void registerPageLocked(Mat& m);
CV_EXPORTS_W void registerPageLocked(Mat& m);
/** @brief Unmaps the memory of matrix and makes it pageable again.
@param m Input matrix.
*/
CV_EXPORTS void unregisterPageLocked(Mat& m);
CV_EXPORTS_W void unregisterPageLocked(Mat& m);
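A minimal sketch (assumed usage): temporarily pin an existing cv::Mat so transfers involving it can run asynchronously, then make it pageable again.

cv::Mat frame(480, 640, CV_8UC3);
cv::cuda::registerPageLocked(frame);     // page-locks the existing buffer
// ... asynchronous uploads/downloads using `frame` ...
cv::cuda::unregisterPageLocked(frame);   // restores pageable memory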
//===================================================================================
// Stream
@@ -496,7 +639,7 @@ void thread2()
@note By default all CUDA routines are launched in the Stream::Null() object if the stream is not specified by the user.
In a multi-threading environment the stream objects must be passed explicitly (see previous note).
*/
class CV_EXPORTS Stream
class CV_EXPORTS_W Stream
{
typedef void (Stream::*bool_type)() const;
void this_type_does_not_support_comparisons() const {}
@@ -505,22 +648,22 @@ public:
typedef void (*StreamCallback)(int status, void* userData);
//! creates a new asynchronous stream
Stream();
CV_WRAP Stream();
//! creates a new asynchronous stream with custom allocator
Stream(const Ptr<GpuMat::Allocator>& allocator);
CV_WRAP Stream(const Ptr<GpuMat::Allocator>& allocator);
/** @brief Returns true if the current stream queue is finished. Otherwise, it returns false.
*/
bool queryIfComplete() const;
CV_WRAP bool queryIfComplete() const;
/** @brief Blocks the current CPU thread until all operations in the stream are complete.
*/
void waitForCompletion();
CV_WRAP void waitForCompletion();
/** @brief Makes a compute stream wait on an event.
*/
void waitEvent(const Event& event);
CV_WRAP void waitEvent(const Event& event);
/** @brief Adds a callback to be called on the host after all currently enqueued items in the stream have
completed.
@@ -533,7 +676,7 @@ public:
void enqueueHostCallback(StreamCallback callback, void* userData);
//! return Stream object for default CUDA stream
static Stream& Null();
CV_WRAP static Stream& Null();
//! returns true if stream object is not default (!= 0)
operator bool_type() const;
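A minimal sketch of the callback mechanism (assumed usage; the callback runs on a driver thread and, per the underlying CUDA rule, should be short and make no CUDA calls):

static void onDone(int /*status*/, void* userData)
{
    *static_cast<bool*>(userData) = true;   // signal completion to the host
}

void sketch()
{
    bool finished = false;
    cv::cuda::Stream stream;
    // ... enqueue GPU work on `stream` ...
    stream.enqueueHostCallback(onDone, &finished);
    stream.waitForCompletion();             // after this, finished == true
}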
@@ -549,7 +692,7 @@ private:
friend class DefaultDeviceInitializer;
};
class CV_EXPORTS Event
class CV_EXPORTS_W Event
{
public:
enum CreateFlags
@@ -560,19 +703,19 @@ public:
INTERPROCESS = 0x04 /**< Event is suitable for interprocess use. DisableTiming must be set */
};
explicit Event(CreateFlags flags = DEFAULT);
CV_WRAP explicit Event(Event::CreateFlags flags = Event::CreateFlags::DEFAULT);
//! records an event
void record(Stream& stream = Stream::Null());
CV_WRAP void record(Stream& stream = Stream::Null());
//! queries an event's status
bool queryIfComplete() const;
CV_WRAP bool queryIfComplete() const;
//! waits for an event to complete
void waitForCompletion();
CV_WRAP void waitForCompletion();
//! computes the elapsed time between events
static float elapsedTime(const Event& start, const Event& end);
CV_WRAP static float elapsedTime(const Event& start, const Event& end);
class Impl;
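A minimal sketch (assumed usage): timing a stretch of GPU work with a pair of events recorded on the same stream.

cv::cuda::Stream stream;
cv::cuda::Event start, stop;                            // DEFAULT flags: timing enabled
start.record(stream);
// ... enqueue GPU work on `stream` ...
stop.record(stream);
stop.waitForCompletion();
float ms = cv::cuda::Event::elapsedTime(start, stop);   // elapsed milliseconds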
@@ -598,7 +741,7 @@ Use this function before any other CUDA functions calls. If OpenCV is compiled w
this function returns 0. If the CUDA driver is not installed, or is incompatible, this function
returns -1.
*/
CV_EXPORTS int getCudaEnabledDeviceCount();
CV_EXPORTS_W int getCudaEnabledDeviceCount();
/** @brief Sets a device and initializes it for the current thread.
@@ -606,18 +749,18 @@ CV_EXPORTS int getCudaEnabledDeviceCount();
If the call of this function is omitted, a default device is initialized at the first CUDA usage.
*/
CV_EXPORTS void setDevice(int device);
CV_EXPORTS_W void setDevice(int device);
/** @brief Returns the current device index set by cuda::setDevice or initialized by default.
*/
CV_EXPORTS int getDevice();
CV_EXPORTS_W int getDevice();
/** @brief Explicitly destroys and cleans up all resources associated with the current device in the current
process.
Any subsequent API call to this device will reinitialize the device.
*/
CV_EXPORTS void resetDevice();
CV_EXPORTS_W void resetDevice();
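A minimal sketch combining the three calls (assumed usage), honoring the return-value convention described above:

int n = cv::cuda::getCudaEnabledDeviceCount();   // 0: built without CUDA, -1: bad driver
if (n > 0)
{
    cv::cuda::setDevice(0);                      // initialize device 0 for this thread
    CV_Assert(cv::cuda::getDevice() == 0);
}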
/** @brief Enumeration providing CUDA computing features.
*/
@@ -650,7 +793,7 @@ built for.
According to the CUDA C Programming Guide Version 3.2: "PTX code produced for some specific compute
capability can always be compiled to binary code of greater or equal compute capability".
*/
class CV_EXPORTS TargetArchs
class CV_EXPORTS_W TargetArchs
{
public:
/** @brief The following method checks whether the module was built with the support of the given feature:
@@ -665,23 +808,23 @@ public:
@param major Major compute capability version.
@param minor Minor compute capability version.
*/
static bool has(int major, int minor);
static bool hasPtx(int major, int minor);
static bool hasBin(int major, int minor);
CV_WRAP static bool has(int major, int minor);
CV_WRAP static bool hasPtx(int major, int minor);
CV_WRAP static bool hasBin(int major, int minor);
static bool hasEqualOrLessPtx(int major, int minor);
static bool hasEqualOrGreater(int major, int minor);
static bool hasEqualOrGreaterPtx(int major, int minor);
static bool hasEqualOrGreaterBin(int major, int minor);
CV_WRAP static bool hasEqualOrLessPtx(int major, int minor);
CV_WRAP static bool hasEqualOrGreater(int major, int minor);
CV_WRAP static bool hasEqualOrGreaterPtx(int major, int minor);
CV_WRAP static bool hasEqualOrGreaterBin(int major, int minor);
};
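A minimal sketch (assumed usage) applying the PTX rule quoted above: a device can run this build if a binary for its exact compute capability exists, or if PTX for an equal-or-lower capability can be JIT-compiled for it.

cv::cuda::DeviceInfo dev;   // current device
bool runnable = cv::cuda::TargetArchs::hasBin(dev.majorVersion(), dev.minorVersion())
             || cv::cuda::TargetArchs::hasEqualOrLessPtx(dev.majorVersion(), dev.minorVersion());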
/** @brief Class providing functionality for querying the specified GPU properties.
*/
class CV_EXPORTS DeviceInfo
class CV_EXPORTS_W DeviceInfo
{
public:
//! creates DeviceInfo object for the current GPU
DeviceInfo();
CV_WRAP DeviceInfo();
/** @brief The constructors.
@@ -690,68 +833,68 @@ public:
Constructs the DeviceInfo object for the specified device. If device_id parameter is missed, it
constructs an object for the current device.
*/
DeviceInfo(int device_id);
CV_WRAP DeviceInfo(int device_id);
/** @brief Returns system index of the CUDA device starting with 0.
*/
int deviceID() const;
CV_WRAP int deviceID() const;
//! ASCII string identifying device
const char* name() const;
//! global memory available on device in bytes
size_t totalGlobalMem() const;
CV_WRAP size_t totalGlobalMem() const;
//! shared memory available per block in bytes
size_t sharedMemPerBlock() const;
CV_WRAP size_t sharedMemPerBlock() const;
//! 32-bit registers available per block
int regsPerBlock() const;
CV_WRAP int regsPerBlock() const;
//! warp size in threads
int warpSize() const;
CV_WRAP int warpSize() const;
//! maximum pitch in bytes allowed by memory copies
size_t memPitch() const;
CV_WRAP size_t memPitch() const;
//! maximum number of threads per block
int maxThreadsPerBlock() const;
CV_WRAP int maxThreadsPerBlock() const;
//! maximum size of each dimension of a block
Vec3i maxThreadsDim() const;
CV_WRAP Vec3i maxThreadsDim() const;
//! maximum size of each dimension of a grid
Vec3i maxGridSize() const;
CV_WRAP Vec3i maxGridSize() const;
//! clock frequency in kilohertz
int clockRate() const;
CV_WRAP int clockRate() const;
//! constant memory available on device in bytes
size_t totalConstMem() const;
CV_WRAP size_t totalConstMem() const;
//! major compute capability
int majorVersion() const;
CV_WRAP int majorVersion() const;
//! minor compute capability
int minorVersion() const;
CV_WRAP int minorVersion() const;
//! alignment requirement for textures
size_t textureAlignment() const;
CV_WRAP size_t textureAlignment() const;
//! pitch alignment requirement for texture references bound to pitched memory
size_t texturePitchAlignment() const;
CV_WRAP size_t texturePitchAlignment() const;
//! number of multiprocessors on device
int multiProcessorCount() const;
CV_WRAP int multiProcessorCount() const;
//! specifies whether there is a run time limit on kernels
bool kernelExecTimeoutEnabled() const;
CV_WRAP bool kernelExecTimeoutEnabled() const;
//! device is integrated as opposed to discrete
bool integrated() const;
CV_WRAP bool integrated() const;
//! device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer
bool canMapHostMemory() const;
CV_WRAP bool canMapHostMemory() const;
enum ComputeMode
{
@@ -762,108 +905,108 @@ public:
};
//! compute mode
ComputeMode computeMode() const;
CV_WRAP DeviceInfo::ComputeMode computeMode() const;
//! maximum 1D texture size
int maxTexture1D() const;
CV_WRAP int maxTexture1D() const;
//! maximum 1D mipmapped texture size
int maxTexture1DMipmap() const;
CV_WRAP int maxTexture1DMipmap() const;
//! maximum size for 1D textures bound to linear memory
int maxTexture1DLinear() const;
CV_WRAP int maxTexture1DLinear() const;
//! maximum 2D texture dimensions
Vec2i maxTexture2D() const;
CV_WRAP Vec2i maxTexture2D() const;
//! maximum 2D mipmapped texture dimensions
Vec2i maxTexture2DMipmap() const;
CV_WRAP Vec2i maxTexture2DMipmap() const;
//! maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory
Vec3i maxTexture2DLinear() const;
CV_WRAP Vec3i maxTexture2DLinear() const;
//! maximum 2D texture dimensions if texture gather operations have to be performed
Vec2i maxTexture2DGather() const;
CV_WRAP Vec2i maxTexture2DGather() const;
//! maximum 3D texture dimensions
Vec3i maxTexture3D() const;
CV_WRAP Vec3i maxTexture3D() const;
//! maximum Cubemap texture dimensions
int maxTextureCubemap() const;
CV_WRAP int maxTextureCubemap() const;
//! maximum 1D layered texture dimensions
Vec2i maxTexture1DLayered() const;
CV_WRAP Vec2i maxTexture1DLayered() const;
//! maximum 2D layered texture dimensions
Vec3i maxTexture2DLayered() const;
CV_WRAP Vec3i maxTexture2DLayered() const;
//! maximum Cubemap layered texture dimensions
Vec2i maxTextureCubemapLayered() const;
CV_WRAP Vec2i maxTextureCubemapLayered() const;
//! maximum 1D surface size
int maxSurface1D() const;
CV_WRAP int maxSurface1D() const;
//! maximum 2D surface dimensions
Vec2i maxSurface2D() const;
CV_WRAP Vec2i maxSurface2D() const;
//! maximum 3D surface dimensions
Vec3i maxSurface3D() const;
CV_WRAP Vec3i maxSurface3D() const;
//! maximum 1D layered surface dimensions
Vec2i maxSurface1DLayered() const;
CV_WRAP Vec2i maxSurface1DLayered() const;
//! maximum 2D layered surface dimensions
Vec3i maxSurface2DLayered() const;
CV_WRAP Vec3i maxSurface2DLayered() const;
//! maximum Cubemap surface dimensions
int maxSurfaceCubemap() const;
CV_WRAP int maxSurfaceCubemap() const;
//! maximum Cubemap layered surface dimensions
Vec2i maxSurfaceCubemapLayered() const;
CV_WRAP Vec2i maxSurfaceCubemapLayered() const;
//! alignment requirements for surfaces
size_t surfaceAlignment() const;
CV_WRAP size_t surfaceAlignment() const;
//! device can possibly execute multiple kernels concurrently
bool concurrentKernels() const;
CV_WRAP bool concurrentKernels() const;
//! device has ECC support enabled
bool ECCEnabled() const;
CV_WRAP bool ECCEnabled() const;
//! PCI bus ID of the device
int pciBusID() const;
CV_WRAP int pciBusID() const;
//! PCI device ID of the device
int pciDeviceID() const;
CV_WRAP int pciDeviceID() const;
//! PCI domain ID of the device
int pciDomainID() const;
CV_WRAP int pciDomainID() const;
//! true if device is a Tesla device using TCC driver, false otherwise
bool tccDriver() const;
CV_WRAP bool tccDriver() const;
//! number of asynchronous engines
int asyncEngineCount() const;
CV_WRAP int asyncEngineCount() const;
//! device shares a unified address space with the host
bool unifiedAddressing() const;
CV_WRAP bool unifiedAddressing() const;
//! peak memory clock frequency in kilohertz
int memoryClockRate() const;
CV_WRAP int memoryClockRate() const;
//! global memory bus width in bits
int memoryBusWidth() const;
CV_WRAP int memoryBusWidth() const;
//! size of L2 cache in bytes
int l2CacheSize() const;
CV_WRAP int l2CacheSize() const;
//! maximum resident threads per multiprocessor
int maxThreadsPerMultiProcessor() const;
CV_WRAP int maxThreadsPerMultiProcessor() const;
//! gets free and total device memory
void queryMemory(size_t& totalMemory, size_t& freeMemory) const;
size_t freeMemory() const;
size_t totalMemory() const;
CV_WRAP void queryMemory(size_t& totalMemory, size_t& freeMemory) const;
CV_WRAP size_t freeMemory() const;
CV_WRAP size_t totalMemory() const;
/** @brief Provides information on CUDA feature support.
@@ -878,14 +1021,14 @@ public:
This function returns true if the CUDA module can be run on the specified device. Otherwise, it
returns false.
*/
bool isCompatible() const;
CV_WRAP bool isCompatible() const;
private:
int device_id_;
};
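A minimal sketch of a typical capability check (assumed usage):

cv::cuda::DeviceInfo info;                 // describes the current device
if (info.isCompatible())                   // the CUDA module can run here
{
    size_t totalMem = 0, freeMem = 0;
    info.queryMemory(totalMem, freeMem);   // both reported in bytes
    // other getters: info.name(), info.multiProcessorCount(), ...
}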
CV_EXPORTS void printCudaDeviceInfo(int device);
CV_EXPORTS void printShortCudaDeviceInfo(int device);
CV_EXPORTS_W void printCudaDeviceInfo(int device);
CV_EXPORTS_W void printShortCudaDeviceInfo(int device);
/** @brief Converts an array to half precision floating number.

View File

@@ -0,0 +1,211 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_CUDA_DEVICE_BLOCK_HPP
#define OPENCV_CUDA_DEVICE_BLOCK_HPP
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
struct Block
{
static __device__ __forceinline__ unsigned int id()
{
return blockIdx.x;
}
static __device__ __forceinline__ unsigned int stride()
{
return blockDim.x * blockDim.y * blockDim.z;
}
static __device__ __forceinline__ void sync()
{
__syncthreads();
}
static __device__ __forceinline__ int flattenedThreadId()
{
return threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
}
template<typename It, typename T>
static __device__ __forceinline__ void fill(It beg, It end, const T& value)
{
int STRIDE = stride();
It t = beg + flattenedThreadId();
for(; t < end; t += STRIDE)
*t = value;
}
template<typename OutIt, typename T>
static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value) // block-wide iota: writes value, value+1, ... into [beg, end)
{
int STRIDE = stride();
int tid = flattenedThreadId();
value += tid;
for(OutIt t = beg + tid; t < end; t += STRIDE, value += STRIDE)
*t = value;
}
template<typename InIt, typename OutIt>
static __device__ __forceinline__ void copy(InIt beg, InIt end, OutIt out)
{
int STRIDE = stride();
InIt t = beg + flattenedThreadId();
OutIt o = out + (t - beg);
for(; t < end; t += STRIDE, o += STRIDE)
*o = *t;
}
template<typename InIt, typename OutIt, class UnOp>
static __device__ __forceinline__ void transform(InIt beg, InIt end, OutIt out, UnOp op)
{
int STRIDE = stride();
InIt t = beg + flattenedThreadId();
OutIt o = out + (t - beg);
for(; t < end; t += STRIDE, o += STRIDE)
*o = op(*t);
}
template<typename InIt1, typename InIt2, typename OutIt, class BinOp>
static __device__ __forceinline__ void transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)
{
int STRIDE = stride();
InIt1 t1 = beg1 + flattenedThreadId();
InIt2 t2 = beg2 + flattenedThreadId();
OutIt o = out + (t1 - beg1);
for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, o += STRIDE)
*o = op(*t1, *t2);
}
template<int CTA_SIZE, typename T, class BinOp>
static __device__ __forceinline__ void reduce(volatile T* buffer, BinOp op)
{
int tid = flattenedThreadId();
T val = buffer[tid];
if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
if (CTA_SIZE >= 512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
if (CTA_SIZE >= 256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
if (CTA_SIZE >= 128) { if (tid < 64) buffer[tid] = val = op(val, buffer[tid + 64]); __syncthreads(); }
if (tid < 32)
{
if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid + 8]); }
if (CTA_SIZE >= 8) { buffer[tid] = val = op(val, buffer[tid + 4]); }
if (CTA_SIZE >= 4) { buffer[tid] = val = op(val, buffer[tid + 2]); }
if (CTA_SIZE >= 2) { buffer[tid] = val = op(val, buffer[tid + 1]); }
}
}
template<int CTA_SIZE, typename T, class BinOp>
static __device__ __forceinline__ T reduce(volatile T* buffer, T init, BinOp op)
{
int tid = flattenedThreadId();
T val = buffer[tid] = init;
__syncthreads();
if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
if (CTA_SIZE >= 512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
if (CTA_SIZE >= 256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
if (CTA_SIZE >= 128) { if (tid < 64) buffer[tid] = val = op(val, buffer[tid + 64]); __syncthreads(); }
if (tid < 32)
{
if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid + 8]); }
if (CTA_SIZE >= 8) { buffer[tid] = val = op(val, buffer[tid + 4]); }
if (CTA_SIZE >= 4) { buffer[tid] = val = op(val, buffer[tid + 2]); }
if (CTA_SIZE >= 2) { buffer[tid] = val = op(val, buffer[tid + 1]); }
}
__syncthreads();
return buffer[0];
}
template <typename T, class BinOp>
static __device__ __forceinline__ void reduce_n(T* data, unsigned int n, BinOp op)
{
int ftid = flattenedThreadId();
int sft = stride();
if (sft < n)
{
for (unsigned int i = sft + ftid; i < n; i += sft)
data[ftid] = op(data[ftid], data[i]);
__syncthreads();
n = sft;
}
while (n > 1)
{
unsigned int half = n/2;
if (ftid < half)
data[ftid] = op(data[ftid], data[n - ftid - 1]);
__syncthreads();
n = n - half;
}
}
};
}}}
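A minimal sketch (assumed usage) of these block-level helpers inside a kernel: all 256 threads cooperatively sum a shared buffer. Launch with exactly 256 threads per block; AddOp is a hypothetical functor defined here for illustration.

struct AddOp
{
    __device__ __forceinline__ float operator()(float a, float b) const { return a + b; }
};

__global__ void blockSumKernel(const float* in, float* out)
{
    __shared__ float buf[256];
    const int tid = cv::cuda::device::Block::flattenedThreadId();
    buf[tid] = in[tid];
    cv::cuda::device::Block::sync();
    cv::cuda::device::Block::reduce<256>(buf, AddOp());   // result ends up in buf[0]
    if (tid == 0)
        *out = buf[0];
}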
//! @endcond
#endif /* OPENCV_CUDA_DEVICE_BLOCK_HPP */

View File

@@ -0,0 +1,722 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_CUDA_BORDER_INTERPOLATE_HPP
#define OPENCV_CUDA_BORDER_INTERPOLATE_HPP
#include "saturate_cast.hpp"
#include "vec_traits.hpp"
#include "vec_math.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
//////////////////////////////////////////////////////////////
// BrdConstant
template <typename D> struct BrdRowConstant
{
typedef D result_type;
explicit __host__ __device__ __forceinline__ BrdRowConstant(int width_, const D& val_ = VecTraits<D>::all(0)) : width(width_), val(val_) {}
template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const
{
return x >= 0 ? saturate_cast<D>(data[x]) : val;
}
template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const
{
return x < width ? saturate_cast<D>(data[x]) : val;
}
template <typename T> __device__ __forceinline__ D at(int x, const T* data) const
{
return (x >= 0 && x < width) ? saturate_cast<D>(data[x]) : val;
}
int width;
D val;
};
template <typename D> struct BrdColConstant
{
typedef D result_type;
explicit __host__ __device__ __forceinline__ BrdColConstant(int height_, const D& val_ = VecTraits<D>::all(0)) : height(height_), val(val_) {}
template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const
{
return y >= 0 ? saturate_cast<D>(*(const T*)((const char*)data + y * step)) : val;
}
template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const
{
return y < height ? saturate_cast<D>(*(const T*)((const char*)data + y * step)) : val;
}
template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const
{
return (y >= 0 && y < height) ? saturate_cast<D>(*(const T*)((const char*)data + y * step)) : val;
}
int height;
D val;
};
template <typename D> struct BrdConstant
{
typedef D result_type;
__host__ __device__ __forceinline__ BrdConstant(int height_, int width_, const D& val_ = VecTraits<D>::all(0)) : height(height_), width(width_), val(val_)
{
}
template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const
{
return (x >= 0 && x < width && y >= 0 && y < height) ? saturate_cast<D>(((const T*)((const uchar*)data + y * step))[x]) : val;
}
template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const
{
return (x >= 0 && x < width && y >= 0 && y < height) ? saturate_cast<D>(src(y, x)) : val;
}
int height;
int width;
D val;
};
//////////////////////////////////////////////////////////////
// BrdReplicate
template <typename D> struct BrdRowReplicate
{
typedef D result_type;
explicit __host__ __device__ __forceinline__ BrdRowReplicate(int width) : last_col(width - 1) {}
template <typename U> __host__ __device__ __forceinline__ BrdRowReplicate(int width, U) : last_col(width - 1) {}
__device__ __forceinline__ int idx_col_low(int x) const
{
return ::max(x, 0);
}
__device__ __forceinline__ int idx_col_high(int x) const
{
return ::min(x, last_col);
}
__device__ __forceinline__ int idx_col(int x) const
{
return idx_col_low(idx_col_high(x));
}
template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const
{
return saturate_cast<D>(data[idx_col_low(x)]);
}
template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const
{
return saturate_cast<D>(data[idx_col_high(x)]);
}
template <typename T> __device__ __forceinline__ D at(int x, const T* data) const
{
return saturate_cast<D>(data[idx_col(x)]);
}
int last_col;
};
template <typename D> struct BrdColReplicate
{
typedef D result_type;
explicit __host__ __device__ __forceinline__ BrdColReplicate(int height) : last_row(height - 1) {}
template <typename U> __host__ __device__ __forceinline__ BrdColReplicate(int height, U) : last_row(height - 1) {}
__device__ __forceinline__ int idx_row_low(int y) const
{
return ::max(y, 0);
}
__device__ __forceinline__ int idx_row_high(int y) const
{
return ::min(y, last_row);
}
__device__ __forceinline__ int idx_row(int y) const
{
return idx_row_low(idx_row_high(y));
}
template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const
{
return saturate_cast<D>(*(const T*)((const char*)data + idx_row_low(y) * step));
}
template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const
{
return saturate_cast<D>(*(const T*)((const char*)data + idx_row_high(y) * step));
}
template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const
{
return saturate_cast<D>(*(const T*)((const char*)data + idx_row(y) * step));
}
int last_row;
};
template <typename D> struct BrdReplicate
{
typedef D result_type;
__host__ __device__ __forceinline__ BrdReplicate(int height, int width) : last_row(height - 1), last_col(width - 1) {}
template <typename U> __host__ __device__ __forceinline__ BrdReplicate(int height, int width, U) : last_row(height - 1), last_col(width - 1) {}
__device__ __forceinline__ int idx_row_low(int y) const
{
return ::max(y, 0);
}
__device__ __forceinline__ int idx_row_high(int y) const
{
return ::min(y, last_row);
}
__device__ __forceinline__ int idx_row(int y) const
{
return idx_row_low(idx_row_high(y));
}
__device__ __forceinline__ int idx_col_low(int x) const
{
return ::max(x, 0);
}
__device__ __forceinline__ int idx_col_high(int x) const
{
return ::min(x, last_col);
}
__device__ __forceinline__ int idx_col(int x) const
{
return idx_col_low(idx_col_high(x));
}
template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const
{
return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);
}
template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const
{
return saturate_cast<D>(src(idx_row(y), idx_col(x)));
}
int last_row;
int last_col;
};
//////////////////////////////////////////////////////////////
// BrdReflect101
template <typename D> struct BrdRowReflect101
{
typedef D result_type;
explicit __host__ __device__ __forceinline__ BrdRowReflect101(int width) : last_col(width - 1) {}
template <typename U> __host__ __device__ __forceinline__ BrdRowReflect101(int width, U) : last_col(width - 1) {}
__device__ __forceinline__ int idx_col_low(int x) const
{
return ::abs(x) % (last_col + 1);
}
__device__ __forceinline__ int idx_col_high(int x) const
{
return ::abs(last_col - ::abs(last_col - x)) % (last_col + 1);
}
__device__ __forceinline__ int idx_col(int x) const
{
return idx_col_low(idx_col_high(x));
}
template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const
{
return saturate_cast<D>(data[idx_col_low(x)]);
}
template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const
{
return saturate_cast<D>(data[idx_col_high(x)]);
}
template <typename T> __device__ __forceinline__ D at(int x, const T* data) const
{
return saturate_cast<D>(data[idx_col(x)]);
}
int last_col;
};
template <typename D> struct BrdColReflect101
{
typedef D result_type;
explicit __host__ __device__ __forceinline__ BrdColReflect101(int height) : last_row(height - 1) {}
template <typename U> __host__ __device__ __forceinline__ BrdColReflect101(int height, U) : last_row(height - 1) {}
__device__ __forceinline__ int idx_row_low(int y) const
{
return ::abs(y) % (last_row + 1);
}
__device__ __forceinline__ int idx_row_high(int y) const
{
return ::abs(last_row - ::abs(last_row - y)) % (last_row + 1);
}
__device__ __forceinline__ int idx_row(int y) const
{
return idx_row_low(idx_row_high(y));
}
template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const
{
return saturate_cast<D>(*(const D*)((const char*)data + idx_row_low(y) * step));
}
template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const
{
return saturate_cast<D>(*(const D*)((const char*)data + idx_row_high(y) * step));
}
template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const
{
return saturate_cast<D>(*(const D*)((const char*)data + idx_row(y) * step));
}
int last_row;
};
template <typename D> struct BrdReflect101
{
typedef D result_type;
__host__ __device__ __forceinline__ BrdReflect101(int height, int width) : last_row(height - 1), last_col(width - 1) {}
template <typename U> __host__ __device__ __forceinline__ BrdReflect101(int height, int width, U) : last_row(height - 1), last_col(width - 1) {}
__device__ __forceinline__ int idx_row_low(int y) const
{
return ::abs(y) % (last_row + 1);
}
__device__ __forceinline__ int idx_row_high(int y) const
{
return ::abs(last_row - ::abs(last_row - y)) % (last_row + 1);
}
__device__ __forceinline__ int idx_row(int y) const
{
return idx_row_low(idx_row_high(y));
}
__device__ __forceinline__ int idx_col_low(int x) const
{
return ::abs(x) % (last_col + 1);
}
__device__ __forceinline__ int idx_col_high(int x) const
{
return ::abs(last_col - ::abs(last_col - x)) % (last_col + 1);
}
__device__ __forceinline__ int idx_col(int x) const
{
return idx_col_low(idx_col_high(x));
}
template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const
{
return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);
}
template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const
{
return saturate_cast<D>(src(idx_row(y), idx_col(x)));
}
int last_row;
int last_col;
};
//////////////////////////////////////////////////////////////
// BrdReflect
template <typename D> struct BrdRowReflect
{
typedef D result_type;
explicit __host__ __device__ __forceinline__ BrdRowReflect(int width) : last_col(width - 1) {}
template <typename U> __host__ __device__ __forceinline__ BrdRowReflect(int width, U) : last_col(width - 1) {}
__device__ __forceinline__ int idx_col_low(int x) const
{
return (::abs(x) - (x < 0)) % (last_col + 1);
}
__device__ __forceinline__ int idx_col_high(int x) const
{
return ::abs(last_col - ::abs(last_col - x) + (x > last_col)) % (last_col + 1);
}
__device__ __forceinline__ int idx_col(int x) const
{
return idx_col_high(::abs(x) - (x < 0));
}
template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const
{
return saturate_cast<D>(data[idx_col_low(x)]);
}
template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const
{
return saturate_cast<D>(data[idx_col_high(x)]);
}
template <typename T> __device__ __forceinline__ D at(int x, const T* data) const
{
return saturate_cast<D>(data[idx_col(x)]);
}
int last_col;
};
template <typename D> struct BrdColReflect
{
typedef D result_type;
explicit __host__ __device__ __forceinline__ BrdColReflect(int height) : last_row(height - 1) {}
template <typename U> __host__ __device__ __forceinline__ BrdColReflect(int height, U) : last_row(height - 1) {}
__device__ __forceinline__ int idx_row_low(int y) const
{
return (::abs(y) - (y < 0)) % (last_row + 1);
}
__device__ __forceinline__ int idx_row_high(int y) const
{
return ::abs(last_row - ::abs(last_row - y) + (y > last_row)) % (last_row + 1);
}
__device__ __forceinline__ int idx_row(int y) const
{
return idx_row_high(::abs(y) - (y < 0));
}
template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const
{
return saturate_cast<D>(*(const D*)((const char*)data + idx_row_low(y) * step));
}
template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const
{
return saturate_cast<D>(*(const D*)((const char*)data + idx_row_high(y) * step));
}
template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const
{
return saturate_cast<D>(*(const D*)((const char*)data + idx_row(y) * step));
}
int last_row;
};
template <typename D> struct BrdReflect
{
typedef D result_type;
__host__ __device__ __forceinline__ BrdReflect(int height, int width) : last_row(height - 1), last_col(width - 1) {}
template <typename U> __host__ __device__ __forceinline__ BrdReflect(int height, int width, U) : last_row(height - 1), last_col(width - 1) {}
__device__ __forceinline__ int idx_row_low(int y) const
{
return (::abs(y) - (y < 0)) % (last_row + 1);
}
__device__ __forceinline__ int idx_row_high(int y) const
{
return /*::abs*/(last_row - ::abs(last_row - y) + (y > last_row)) /*% (last_row + 1)*/;
}
__device__ __forceinline__ int idx_row(int y) const
{
return idx_row_low(idx_row_high(y));
}
__device__ __forceinline__ int idx_col_low(int x) const
{
return (::abs(x) - (x < 0)) % (last_col + 1);
}
__device__ __forceinline__ int idx_col_high(int x) const
{
return (last_col - ::abs(last_col - x) + (x > last_col));
}
__device__ __forceinline__ int idx_col(int x) const
{
return idx_col_low(idx_col_high(x));
}
template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const
{
return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);
}
template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const
{
return saturate_cast<D>(src(idx_row(y), idx_col(x)));
}
int last_row;
int last_col;
};
//////////////////////////////////////////////////////////////
// BrdWrap
template <typename D> struct BrdRowWrap
{
typedef D result_type;
explicit __host__ __device__ __forceinline__ BrdRowWrap(int width_) : width(width_) {}
template <typename U> __host__ __device__ __forceinline__ BrdRowWrap(int width_, U) : width(width_) {}
__device__ __forceinline__ int idx_col_low(int x) const
{
return (x >= 0) * x + (x < 0) * (x - ((x - width + 1) / width) * width);
}
__device__ __forceinline__ int idx_col_high(int x) const
{
return (x < width) * x + (x >= width) * (x % width);
}
__device__ __forceinline__ int idx_col(int x) const
{
return idx_col_high(idx_col_low(x));
}
template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const
{
return saturate_cast<D>(data[idx_col_low(x)]);
}
template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const
{
return saturate_cast<D>(data[idx_col_high(x)]);
}
template <typename T> __device__ __forceinline__ D at(int x, const T* data) const
{
return saturate_cast<D>(data[idx_col(x)]);
}
int width;
};
template <typename D> struct BrdColWrap
{
typedef D result_type;
explicit __host__ __device__ __forceinline__ BrdColWrap(int height_) : height(height_) {}
template <typename U> __host__ __device__ __forceinline__ BrdColWrap(int height_, U) : height(height_) {}
__device__ __forceinline__ int idx_row_low(int y) const
{
return (y >= 0) * y + (y < 0) * (y - ((y - height + 1) / height) * height);
}
__device__ __forceinline__ int idx_row_high(int y) const
{
return (y < height) * y + (y >= height) * (y % height);
}
__device__ __forceinline__ int idx_row(int y) const
{
return idx_row_high(idx_row_low(y));
}
template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const
{
return saturate_cast<D>(*(const D*)((const char*)data + idx_row_low(y) * step));
}
template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const
{
return saturate_cast<D>(*(const D*)((const char*)data + idx_row_high(y) * step));
}
template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const
{
return saturate_cast<D>(*(const D*)((const char*)data + idx_row(y) * step));
}
int height;
};
template <typename D> struct BrdWrap
{
typedef D result_type;
__host__ __device__ __forceinline__ BrdWrap(int height_, int width_) :
height(height_), width(width_)
{
}
template <typename U>
__host__ __device__ __forceinline__ BrdWrap(int height_, int width_, U) :
height(height_), width(width_)
{
}
__device__ __forceinline__ int idx_row_low(int y) const
{
return (y >= 0) ? y : (y - ((y - height + 1) / height) * height);
}
__device__ __forceinline__ int idx_row_high(int y) const
{
return (y < height) ? y : (y % height);
}
__device__ __forceinline__ int idx_row(int y) const
{
return idx_row_high(idx_row_low(y));
}
__device__ __forceinline__ int idx_col_low(int x) const
{
return (x >= 0) ? x : (x - ((x - width + 1) / width) * width);
}
__device__ __forceinline__ int idx_col_high(int x) const
{
return (x < width) ? x : (x % width);
}
__device__ __forceinline__ int idx_col(int x) const
{
return idx_col_high(idx_col_low(x));
}
template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const
{
return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);
}
template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const
{
return saturate_cast<D>(src(idx_row(y), idx_col(x)));
}
int height;
int width;
};
//////////////////////////////////////////////////////////////
// BorderReader
template <typename Ptr2D, typename B> struct BorderReader
{
typedef typename B::result_type elem_type;
typedef typename Ptr2D::index_type index_type;
__host__ __device__ __forceinline__ BorderReader(const Ptr2D& ptr_, const B& b_) : ptr(ptr_), b(b_) {}
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const
{
return b.at(y, x, ptr);
}
Ptr2D ptr;
B b;
};
// Under Win32 there is a bug with templated types passed as kernel parameters;
// with this specialization everything works fine.
template <typename Ptr2D, typename D> struct BorderReader< Ptr2D, BrdConstant<D> >
{
typedef typename BrdConstant<D>::result_type elem_type;
typedef typename Ptr2D::index_type index_type;
__host__ __device__ __forceinline__ BorderReader(const Ptr2D& src_, const BrdConstant<D>& b) :
src(src_), height(b.height), width(b.width), val(b.val)
{
}
__device__ __forceinline__ D operator ()(index_type y, index_type x) const
{
return (x >= 0 && x < width && y >= 0 && y < height) ? saturate_cast<D>(src(y, x)) : val;
}
Ptr2D src;
int height;
int width;
D val;
};
}}} // namespace cv { namespace cuda { namespace device
//! @endcond
#endif // OPENCV_CUDA_BORDER_INTERPOLATE_HPP
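Note: the wrap border above folds any out-of-range coordinate back into [0, size) in two steps — idx_*_low lifts negative indices up by whole periods, idx_*_high reduces overshoot with a modulo. A minimal host-side sketch of that index math (standalone and illustrative only, not part of the diff):
#include <cassert>
// Mirrors BrdWrap::idx_col_low followed by idx_col_high for a row of length `width`.
int wrap_index(int x, int width)
{
    if (x < 0)
        x = x - ((x - width + 1) / width) * width; // lift by whole periods
    return (x < width) ? x : (x % width);          // fold down by modulo
}
int main()
{
    assert(wrap_index(-1, 5) == 4); // one step left of 0 wraps to the end
    assert(wrap_index(-6, 5) == 4); // more than one period still lands in range
    assert(wrap_index( 7, 5) == 2); // 7 mod 5
    assert(wrap_index( 3, 5) == 3); // in-range values pass through
    return 0;
}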


@@ -0,0 +1,309 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_CUDA_COLOR_HPP
#define OPENCV_CUDA_COLOR_HPP
#include "detail/color_detail.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
// All OPENCV_CUDA_IMPLEMENT_*_TRAITS(ColorSpace1_to_ColorSpace2, ...) macros implement
// template <typename T> class ColorSpace1_to_ColorSpace2_traits
// {
// typedef ... functor_type;
// static __host__ __device__ functor_type create_functor();
// };
OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgb, 3, 3, 2)
OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_bgra, 3, 4, 0)
OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgba, 3, 4, 2)
OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_bgr, 4, 3, 0)
OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgb, 4, 3, 2)
OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgba, 4, 4, 2)
#undef OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS
OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr555, 3, 0, 5)
OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr565, 3, 0, 6)
OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr555, 3, 2, 5)
OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr565, 3, 2, 6)
OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr555, 4, 0, 5)
OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr565, 4, 0, 6)
OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr555, 4, 2, 5)
OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr565, 4, 2, 6)
#undef OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS
OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgb, 3, 2, 5)
OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgb, 3, 2, 6)
OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgr, 3, 0, 5)
OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgr, 3, 0, 6)
OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgba, 4, 2, 5)
OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgba, 4, 2, 6)
OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgra, 4, 0, 5)
OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgra, 4, 0, 6)
#undef OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS
OPENCV_CUDA_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgr, 3)
OPENCV_CUDA_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgra, 4)
#undef OPENCV_CUDA_IMPLEMENT_GRAY2RGB_TRAITS
OPENCV_CUDA_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr555, 5)
OPENCV_CUDA_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr565, 6)
#undef OPENCV_CUDA_IMPLEMENT_GRAY2RGB5x5_TRAITS
OPENCV_CUDA_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr555_to_gray, 5)
OPENCV_CUDA_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr565_to_gray, 6)
#undef OPENCV_CUDA_IMPLEMENT_RGB5x52GRAY_TRAITS
OPENCV_CUDA_IMPLEMENT_RGB2GRAY_TRAITS(rgb_to_gray, 3, 2)
OPENCV_CUDA_IMPLEMENT_RGB2GRAY_TRAITS(bgr_to_gray, 3, 0)
OPENCV_CUDA_IMPLEMENT_RGB2GRAY_TRAITS(rgba_to_gray, 4, 2)
OPENCV_CUDA_IMPLEMENT_RGB2GRAY_TRAITS(bgra_to_gray, 4, 0)
#undef OPENCV_CUDA_IMPLEMENT_RGB2GRAY_TRAITS
OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv, 3, 3, 2)
OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv, 4, 3, 2)
OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv4, 3, 4, 2)
OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv4, 4, 4, 2)
OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv, 3, 3, 0)
OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv, 4, 3, 0)
OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv4, 3, 4, 0)
OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv4, 4, 4, 0)
#undef OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS
OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgb, 3, 3, 2)
OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgba, 3, 4, 2)
OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgb, 4, 3, 2)
OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgba, 4, 4, 2)
OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgr, 3, 3, 0)
OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgra, 3, 4, 0)
OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgr, 4, 3, 0)
OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgra, 4, 4, 0)
#undef OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS
OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb, 3, 3, 2)
OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb, 4, 3, 2)
OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb4, 3, 4, 2)
OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb4, 4, 4, 2)
OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb, 3, 3, 0)
OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb, 4, 3, 0)
OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb4, 3, 4, 0)
OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb4, 4, 4, 0)
#undef OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS
OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgb, 3, 3, 2)
OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgba, 3, 4, 2)
OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgb, 4, 3, 2)
OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgba, 4, 4, 2)
OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgr, 3, 3, 0)
OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgra, 3, 4, 0)
OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgr, 4, 3, 0)
OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgra, 4, 4, 0)
#undef OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS
OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz, 3, 3, 2)
OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz, 4, 3, 2)
OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz4, 3, 4, 2)
OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz4, 4, 4, 2)
OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz, 3, 3, 0)
OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz, 4, 3, 0)
OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz4, 3, 4, 0)
OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz4, 4, 4, 0)
#undef OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS
OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgb, 3, 3, 2)
OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgb, 4, 3, 2)
OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgba, 3, 4, 2)
OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgba, 4, 4, 2)
OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgr, 3, 3, 0)
OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgr, 4, 3, 0)
OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgra, 3, 4, 0)
OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgra, 4, 4, 0)
#undef OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS
OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv, 3, 3, 2)
OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv, 4, 3, 2)
OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv4, 3, 4, 2)
OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv4, 4, 4, 2)
OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv, 3, 3, 0)
OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv, 4, 3, 0)
OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv4, 3, 4, 0)
OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv4, 4, 4, 0)
#undef OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS
OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgb, 3, 3, 2)
OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgba, 3, 4, 2)
OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgb, 4, 3, 2)
OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgba, 4, 4, 2)
OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgr, 3, 3, 0)
OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgra, 3, 4, 0)
OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgr, 4, 3, 0)
OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgra, 4, 4, 0)
#undef OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS
OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls, 3, 3, 2)
OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls, 4, 3, 2)
OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls4, 3, 4, 2)
OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls4, 4, 4, 2)
OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls, 3, 3, 0)
OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls, 4, 3, 0)
OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls4, 3, 4, 0)
OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls4, 4, 4, 0)
#undef OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS
OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgb, 3, 3, 2)
OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgba, 3, 4, 2)
OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgb, 4, 3, 2)
OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgba, 4, 4, 2)
OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgr, 3, 3, 0)
OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgra, 3, 4, 0)
OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgr, 4, 3, 0)
OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgra, 4, 4, 0)
#undef OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS
OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(rgb_to_lab, 3, 3, true, 2)
OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(rgba_to_lab, 4, 3, true, 2)
OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(rgb_to_lab4, 3, 4, true, 2)
OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(rgba_to_lab4, 4, 4, true, 2)
OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(bgr_to_lab, 3, 3, true, 0)
OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(bgra_to_lab, 4, 3, true, 0)
OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(bgr_to_lab4, 3, 4, true, 0)
OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(bgra_to_lab4, 4, 4, true, 0)
OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lrgb_to_lab, 3, 3, false, 2)
OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lrgba_to_lab, 4, 3, false, 2)
OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lrgb_to_lab4, 3, 4, false, 2)
OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lrgba_to_lab4, 4, 4, false, 2)
OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lbgr_to_lab, 3, 3, false, 0)
OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lbgra_to_lab, 4, 3, false, 0)
OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lbgr_to_lab4, 3, 4, false, 0)
OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lbgra_to_lab4, 4, 4, false, 0)
#undef OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS
OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_rgb, 3, 3, true, 2)
OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_rgb, 4, 3, true, 2)
OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_rgba, 3, 4, true, 2)
OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_rgba, 4, 4, true, 2)
OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_bgr, 3, 3, true, 0)
OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_bgr, 4, 3, true, 0)
OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_bgra, 3, 4, true, 0)
OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_bgra, 4, 4, true, 0)
OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lrgb, 3, 3, false, 2)
OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lrgb, 4, 3, false, 2)
OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lrgba, 3, 4, false, 2)
OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lrgba, 4, 4, false, 2)
OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lbgr, 3, 3, false, 0)
OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lbgr, 4, 3, false, 0)
OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lbgra, 3, 4, false, 0)
OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lbgra, 4, 4, false, 0)
#undef OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS
OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(rgb_to_luv, 3, 3, true, 2)
OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(rgba_to_luv, 4, 3, true, 2)
OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(rgb_to_luv4, 3, 4, true, 2)
OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(rgba_to_luv4, 4, 4, true, 2)
OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(bgr_to_luv, 3, 3, true, 0)
OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(bgra_to_luv, 4, 3, true, 0)
OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(bgr_to_luv4, 3, 4, true, 0)
OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(bgra_to_luv4, 4, 4, true, 0)
OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lrgb_to_luv, 3, 3, false, 2)
OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lrgba_to_luv, 4, 3, false, 2)
OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lrgb_to_luv4, 3, 4, false, 2)
OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lrgba_to_luv4, 4, 4, false, 2)
OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lbgr_to_luv, 3, 3, false, 0)
OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lbgra_to_luv, 4, 3, false, 0)
OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lbgr_to_luv4, 3, 4, false, 0)
OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lbgra_to_luv4, 4, 4, false, 0)
#undef OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS
OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_rgb, 3, 3, true, 2)
OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_rgb, 4, 3, true, 2)
OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_rgba, 3, 4, true, 2)
OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_rgba, 4, 4, true, 2)
OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_bgr, 3, 3, true, 0)
OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_bgr, 4, 3, true, 0)
OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_bgra, 3, 4, true, 0)
OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_bgra, 4, 4, true, 0)
OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lrgb, 3, 3, false, 2)
OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lrgb, 4, 3, false, 2)
OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lrgba, 3, 4, false, 2)
OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lrgba, 4, 4, false, 2)
OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lbgr, 3, 3, false, 0)
OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lbgr, 4, 3, false, 0)
OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lbgra, 3, 4, false, 0)
OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lbgra, 4, 4, false, 0)
#undef OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS
}}} // namespace cv { namespace cuda { namespace device
//! @endcond
#endif // OPENCV_CUDA_COLOR_HPP
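Note: each macro invocation above stamps out a traits class of the documented shape. A standalone sketch of that pattern with a simplified stand-in functor (the real conversion functors live in detail/color_detail.hpp; names here are illustrative):
#include <cstdio>
struct SwapRB // stand-in for the generated BGR-to-RGB conversion functor
{
    void operator()(const unsigned char in[3], unsigned char out[3]) const
    {
        out[0] = in[2]; out[1] = in[1]; out[2] = in[0];
    }
};
template <typename T> struct bgr_to_rgb_traits // shape documented above
{
    typedef SwapRB functor_type;
    static functor_type create_functor() { return functor_type(); }
};
int main()
{
    unsigned char bgr[3] = {255, 0, 0}, rgb[3];
    bgr_to_rgb_traits<unsigned char>::create_functor()(bgr, rgb);
    std::printf("%d %d %d\n", rgb[0], rgb[1], rgb[2]); // prints: 0 0 255
    return 0;
}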


@@ -0,0 +1,109 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_CUDA_COMMON_HPP
#define OPENCV_CUDA_COMMON_HPP
#include <cuda_runtime.h>
#include "opencv2/core/cuda_types.hpp"
#include "opencv2/core/cvdef.h"
#include "opencv2/core/base.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
#ifndef CV_PI_F
#ifndef CV_PI
#define CV_PI_F 3.14159265f
#else
#define CV_PI_F ((float)CV_PI)
#endif
#endif
namespace cv { namespace cuda {
static inline void checkCudaError(cudaError_t err, const char* file, const int line, const char* func)
{
if (cudaSuccess != err)
cv::error(cv::Error::GpuApiCallError, cudaGetErrorString(err), func, file, line);
}
}}
#ifndef cudaSafeCall
#define cudaSafeCall(expr) cv::cuda::checkCudaError(expr, __FILE__, __LINE__, CV_Func)
#endif
namespace cv { namespace cuda
{
template <typename T> static inline bool isAligned(const T* ptr, size_t size)
{
return reinterpret_cast<size_t>(ptr) % size == 0;
}
static inline bool isAligned(size_t step, size_t size)
{
return step % size == 0;
}
}}
namespace cv { namespace cuda
{
namespace device
{
__host__ __device__ __forceinline__ int divUp(int total, int grain)
{
return (total + grain - 1) / grain;
}
template<class T> inline void bindTexture(const textureReference* tex, const PtrStepSz<T>& img)
{
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
}
}
}}
//! @endcond
#endif // OPENCV_CUDA_COMMON_HPP
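Note: divUp above is the usual ceiling division used to size CUDA launch grids so a partial block covers the image edge. A small host-side check of the rounding (standalone; the image and block sizes are illustrative):
#include <cassert>
// Same rounding as cv::cuda::device::divUp above: ceil(total / grain).
int divUp(int total, int grain) { return (total + grain - 1) / grain; }
int main()
{
    // Tiling a 1000x770 image with 32x8 thread blocks:
    assert(divUp(1000, 32) == 32); // 31 full blocks + 1 partial
    assert(divUp(770, 8) == 97);   // 96 full blocks + 1 partial
    assert(divUp(768, 8) == 96);   // exact multiples do not round up
    return 0;
}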


@@ -0,0 +1,113 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_CUDA_DATAMOV_UTILS_HPP
#define OPENCV_CUDA_DATAMOV_UTILS_HPP
#include "common.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 200
// For Fermi and newer, the memory space is detected automatically
template <typename T> struct ForceGlob
{
__device__ __forceinline__ static void Load(const T* ptr, int offset, T& val) { val = ptr[offset]; }
};
#else // __CUDA_ARCH__ >= 200
#if defined(_WIN64) || defined(__LP64__)
// 64-bit register modifier for inlined asm
#define OPENCV_CUDA_ASM_PTR "l"
#else
// 32-bit register modifier for inlined asm
#define OPENCV_CUDA_ASM_PTR "r"
#endif
template<class T> struct ForceGlob;
#define OPENCV_CUDA_DEFINE_FORCE_GLOB(base_type, ptx_type, reg_mod) \
template <> struct ForceGlob<base_type> \
{ \
__device__ __forceinline__ static void Load(const base_type* ptr, int offset, base_type& val) \
{ \
asm("ld.global."#ptx_type" %0, [%1];" : "="#reg_mod(val) : OPENCV_CUDA_ASM_PTR(ptr + offset)); \
} \
};
#define OPENCV_CUDA_DEFINE_FORCE_GLOB_B(base_type, ptx_type) \
template <> struct ForceGlob<base_type> \
{ \
__device__ __forceinline__ static void Load(const base_type* ptr, int offset, base_type& val) \
{ \
asm("ld.global."#ptx_type" %0, [%1];" : "=r"(*reinterpret_cast<uint*>(&val)) : OPENCV_CUDA_ASM_PTR(ptr + offset)); \
} \
};
OPENCV_CUDA_DEFINE_FORCE_GLOB_B(uchar, u8)
OPENCV_CUDA_DEFINE_FORCE_GLOB_B(schar, s8)
OPENCV_CUDA_DEFINE_FORCE_GLOB_B(char, b8)
OPENCV_CUDA_DEFINE_FORCE_GLOB (ushort, u16, h)
OPENCV_CUDA_DEFINE_FORCE_GLOB (short, s16, h)
OPENCV_CUDA_DEFINE_FORCE_GLOB (uint, u32, r)
OPENCV_CUDA_DEFINE_FORCE_GLOB (int, s32, r)
OPENCV_CUDA_DEFINE_FORCE_GLOB (float, f32, f)
OPENCV_CUDA_DEFINE_FORCE_GLOB (double, f64, d)
#undef OPENCV_CUDA_DEFINE_FORCE_GLOB
#undef OPENCV_CUDA_DEFINE_FORCE_GLOB_B
#undef OPENCV_CUDA_ASM_PTR
#endif // __CUDA_ARCH__ >= 200
}}} // namespace cv { namespace cuda { namespace device
//! @endcond
#endif // OPENCV_CUDA_DATAMOV_UTILS_HPP
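Note: ForceGlob above guarantees the load compiles to an explicit ld.global on pre-Fermi targets, where the compiler cannot infer the memory space from a raw pointer. A hypothetical kernel showing the intended call shape (the include path assumes the vendored header layout):
#include "opencv2/core/cuda/datamov_utils.hpp"
__global__ void copyRow(const float* src, float* dst, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        float v;
        // Plain load on sm_2x and newer, explicit ld.global.f32 before that.
        cv::cuda::device::ForceGlob<float>::Load(src, i, v);
        dst[i] = v;
    }
}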

File diff suppressed because one or more lines are too long


@@ -0,0 +1,365 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_CUDA_REDUCE_DETAIL_HPP
#define OPENCV_CUDA_REDUCE_DETAIL_HPP
#include <thrust/tuple.h>
#include "../warp.hpp"
#include "../warp_shuffle.hpp"
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
namespace reduce_detail
{
template <typename T> struct GetType;
template <typename T> struct GetType<T*>
{
typedef T type;
};
template <typename T> struct GetType<volatile T*>
{
typedef T type;
};
template <typename T> struct GetType<T&>
{
typedef T type;
};
template <unsigned int I, unsigned int N>
struct For
{
template <class PointerTuple, class ValTuple>
static __device__ void loadToSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
{
thrust::get<I>(smem)[tid] = thrust::get<I>(val);
For<I + 1, N>::loadToSmem(smem, val, tid);
}
template <class PointerTuple, class ValTuple>
static __device__ void loadFromSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
{
thrust::get<I>(val) = thrust::get<I>(smem)[tid];
For<I + 1, N>::loadFromSmem(smem, val, tid);
}
template <class PointerTuple, class ValTuple, class OpTuple>
static __device__ void merge(const PointerTuple& smem, const ValTuple& val, unsigned int tid, unsigned int delta, const OpTuple& op)
{
typename GetType<typename thrust::tuple_element<I, PointerTuple>::type>::type reg = thrust::get<I>(smem)[tid + delta];
thrust::get<I>(smem)[tid] = thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);
For<I + 1, N>::merge(smem, val, tid, delta, op);
}
template <class ValTuple, class OpTuple>
static __device__ void mergeShfl(const ValTuple& val, unsigned int delta, unsigned int width, const OpTuple& op)
{
typename GetType<typename thrust::tuple_element<I, ValTuple>::type>::type reg = shfl_down(thrust::get<I>(val), delta, width);
thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);
For<I + 1, N>::mergeShfl(val, delta, width, op);
}
};
template <unsigned int N>
struct For<N, N>
{
template <class PointerTuple, class ValTuple>
static __device__ void loadToSmem(const PointerTuple&, const ValTuple&, unsigned int)
{
}
template <class PointerTuple, class ValTuple>
static __device__ void loadFromSmem(const PointerTuple&, const ValTuple&, unsigned int)
{
}
template <class PointerTuple, class ValTuple, class OpTuple>
static __device__ void merge(const PointerTuple&, const ValTuple&, unsigned int, unsigned int, const OpTuple&)
{
}
template <class ValTuple, class OpTuple>
static __device__ void mergeShfl(const ValTuple&, unsigned int, unsigned int, const OpTuple&)
{
}
};
template <typename T>
__device__ __forceinline__ void loadToSmem(volatile T* smem, T& val, unsigned int tid)
{
smem[tid] = val;
}
template <typename T>
__device__ __forceinline__ void loadFromSmem(volatile T* smem, T& val, unsigned int tid)
{
val = smem[tid];
}
template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
__device__ __forceinline__ void loadToSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
unsigned int tid)
{
For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadToSmem(smem, val, tid);
}
template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
__device__ __forceinline__ void loadFromSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
unsigned int tid)
{
For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadFromSmem(smem, val, tid);
}
template <typename T, class Op>
__device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op)
{
T reg = smem[tid + delta];
smem[tid] = val = op(val, reg);
}
template <typename T, class Op>
__device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op)
{
T reg = shfl_down(val, delta, width);
val = op(val, reg);
}
template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
__device__ __forceinline__ void merge(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
unsigned int tid,
unsigned int delta,
const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
{
For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::merge(smem, val, tid, delta, op);
}
template <typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
__device__ __forceinline__ void mergeShfl(const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
unsigned int delta,
unsigned int width,
const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
{
For<0, thrust::tuple_size<thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9> >::value>::mergeShfl(val, delta, width, op);
}
template <unsigned int N> struct Generic
{
template <typename Pointer, typename Reference, class Op>
static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
{
loadToSmem(smem, val, tid);
if (N >= 32)
__syncthreads();
if (N >= 2048)
{
if (tid < 1024)
merge(smem, val, tid, 1024, op);
__syncthreads();
}
if (N >= 1024)
{
if (tid < 512)
merge(smem, val, tid, 512, op);
__syncthreads();
}
if (N >= 512)
{
if (tid < 256)
merge(smem, val, tid, 256, op);
__syncthreads();
}
if (N >= 256)
{
if (tid < 128)
merge(smem, val, tid, 128, op);
__syncthreads();
}
if (N >= 128)
{
if (tid < 64)
merge(smem, val, tid, 64, op);
__syncthreads();
}
if (N >= 64)
{
if (tid < 32)
merge(smem, val, tid, 32, op);
}
if (tid < 16)
{
merge(smem, val, tid, 16, op);
merge(smem, val, tid, 8, op);
merge(smem, val, tid, 4, op);
merge(smem, val, tid, 2, op);
merge(smem, val, tid, 1, op);
}
}
};
template <unsigned int I, typename Pointer, typename Reference, class Op>
struct Unroll
{
static __device__ void loopShfl(Reference val, Op op, unsigned int N)
{
mergeShfl(val, I, N, op);
Unroll<I / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
}
static __device__ void loop(Pointer smem, Reference val, unsigned int tid, Op op)
{
merge(smem, val, tid, I, op);
Unroll<I / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
}
};
template <typename Pointer, typename Reference, class Op>
struct Unroll<0, Pointer, Reference, Op>
{
static __device__ void loopShfl(Reference, Op, unsigned int)
{
}
static __device__ void loop(Pointer, Reference, unsigned int, Op)
{
}
};
template <unsigned int N> struct WarpOptimized
{
template <typename Pointer, typename Reference, class Op>
static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
CV_UNUSED(smem);
CV_UNUSED(tid);
Unroll<N / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
#else
loadToSmem(smem, val, tid);
if (tid < N / 2)
Unroll<N / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
#endif
}
};
template <unsigned int N> struct GenericOptimized32
{
enum { M = N / 32 };
template <typename Pointer, typename Reference, class Op>
static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
{
const unsigned int laneId = Warp::laneId();
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize);
if (laneId == 0)
loadToSmem(smem, val, tid / 32);
#else
loadToSmem(smem, val, tid);
if (laneId < 16)
Unroll<16, Pointer, Reference, Op>::loop(smem, val, tid, op);
__syncthreads();
if (laneId == 0)
loadToSmem(smem, val, tid / 32);
#endif
__syncthreads();
loadFromSmem(smem, val, tid);
if (tid < 32)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
Unroll<M / 2, Pointer, Reference, Op>::loopShfl(val, op, M);
#else
Unroll<M / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
#endif
}
}
};
template <bool val, class T1, class T2> struct StaticIf;
template <class T1, class T2> struct StaticIf<true, T1, T2>
{
typedef T1 type;
};
template <class T1, class T2> struct StaticIf<false, T1, T2>
{
typedef T2 type;
};
template <unsigned int N> struct IsPowerOf2
{
enum { value = ((N != 0) && !(N & (N - 1))) };
};
template <unsigned int N> struct Dispatcher
{
typedef typename StaticIf<
(N <= 32) && IsPowerOf2<N>::value,
WarpOptimized<N>,
typename StaticIf<
(N <= 1024) && IsPowerOf2<N>::value,
GenericOptimized32<N>,
Generic<N>
>::type
>::type reductor;
};
}
}}}
//! @endcond
#endif // OPENCV_CUDA_REDUCE_DETAIL_HPP
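Note: Dispatcher above selects a reduction strategy at compile time from the block size N — warp shuffles for power-of-two sizes up to 32, the warp-partitioned path up to 1024, and the generic shared-memory tree otherwise. A standalone sketch of the same selection logic, evaluated at run time for readability:
#include <cstdio>
template <unsigned int N> struct IsPowerOf2
{
    enum { value = ((N != 0) && !(N & (N - 1))) };
};
template <unsigned int N>
const char* reductor_name()
{
    if (N <= 32 && IsPowerOf2<N>::value)   return "WarpOptimized";
    if (N <= 1024 && IsPowerOf2<N>::value) return "GenericOptimized32";
    return "Generic";
}
int main()
{
    std::printf("%s\n", reductor_name<32>());  // WarpOptimized
    std::printf("%s\n", reductor_name<256>()); // GenericOptimized32
    std::printf("%s\n", reductor_name<192>()); // Generic (not a power of 2)
    return 0;
}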


@@ -0,0 +1,502 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP
#define OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP
#include <thrust/tuple.h>
#include "../warp.hpp"
#include "../warp_shuffle.hpp"
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
namespace reduce_key_val_detail
{
template <typename T> struct GetType;
template <typename T> struct GetType<T*>
{
typedef T type;
};
template <typename T> struct GetType<volatile T*>
{
typedef T type;
};
template <typename T> struct GetType<T&>
{
typedef T type;
};
template <unsigned int I, unsigned int N>
struct For
{
template <class PointerTuple, class ReferenceTuple>
static __device__ void loadToSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
{
thrust::get<I>(smem)[tid] = thrust::get<I>(data);
For<I + 1, N>::loadToSmem(smem, data, tid);
}
template <class PointerTuple, class ReferenceTuple>
static __device__ void loadFromSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
{
thrust::get<I>(data) = thrust::get<I>(smem)[tid];
For<I + 1, N>::loadFromSmem(smem, data, tid);
}
template <class ReferenceTuple>
static __device__ void copyShfl(const ReferenceTuple& val, unsigned int delta, int width)
{
thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);
For<I + 1, N>::copyShfl(val, delta, width);
}
template <class PointerTuple, class ReferenceTuple>
static __device__ void copy(const PointerTuple& svals, const ReferenceTuple& val, unsigned int tid, unsigned int delta)
{
thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];
For<I + 1, N>::copy(svals, val, tid, delta);
}
template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
static __device__ void mergeShfl(const KeyReferenceTuple& key, const ValReferenceTuple& val, const CmpTuple& cmp, unsigned int delta, int width)
{
typename GetType<typename thrust::tuple_element<I, KeyReferenceTuple>::type>::type reg = shfl_down(thrust::get<I>(key), delta, width);
if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
{
thrust::get<I>(key) = reg;
thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);
}
For<I + 1, N>::mergeShfl(key, val, cmp, delta, width);
}
template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
static __device__ void merge(const KeyPointerTuple& skeys, const KeyReferenceTuple& key,
const ValPointerTuple& svals, const ValReferenceTuple& val,
const CmpTuple& cmp,
unsigned int tid, unsigned int delta)
{
typename GetType<typename thrust::tuple_element<I, KeyPointerTuple>::type>::type reg = thrust::get<I>(skeys)[tid + delta];
if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
{
thrust::get<I>(skeys)[tid] = thrust::get<I>(key) = reg;
thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];
}
For<I + 1, N>::merge(skeys, key, svals, val, cmp, tid, delta);
}
};
template <unsigned int N>
struct For<N, N>
{
template <class PointerTuple, class ReferenceTuple>
static __device__ void loadToSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
{
}
template <class PointerTuple, class ReferenceTuple>
static __device__ void loadFromSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
{
}
template <class ReferenceTuple>
static __device__ void copyShfl(const ReferenceTuple&, unsigned int, int)
{
}
template <class PointerTuple, class ReferenceTuple>
static __device__ void copy(const PointerTuple&, const ReferenceTuple&, unsigned int, unsigned int)
{
}
template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
static __device__ void mergeShfl(const KeyReferenceTuple&, const ValReferenceTuple&, const CmpTuple&, unsigned int, int)
{
}
template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
static __device__ void merge(const KeyPointerTuple&, const KeyReferenceTuple&,
const ValPointerTuple&, const ValReferenceTuple&,
const CmpTuple&,
unsigned int, unsigned int)
{
}
};
//////////////////////////////////////////////////////
// loadToSmem
template <typename T>
__device__ __forceinline__ void loadToSmem(volatile T* smem, T& data, unsigned int tid)
{
smem[tid] = data;
}
template <typename T>
__device__ __forceinline__ void loadFromSmem(volatile T* smem, T& data, unsigned int tid)
{
data = smem[tid];
}
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
__device__ __forceinline__ void loadToSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
unsigned int tid)
{
For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadToSmem(smem, data, tid);
}
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
__device__ __forceinline__ void loadFromSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
unsigned int tid)
{
For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadFromSmem(smem, data, tid);
}
//////////////////////////////////////////////////////
// copyVals
template <typename V>
__device__ __forceinline__ void copyValsShfl(V& val, unsigned int delta, int width)
{
val = shfl_down(val, delta, width);
}
template <typename V>
__device__ __forceinline__ void copyVals(volatile V* svals, V& val, unsigned int tid, unsigned int delta)
{
svals[tid] = val = svals[tid + delta];
}
template <typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
__device__ __forceinline__ void copyValsShfl(const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
unsigned int delta,
int width)
{
For<0, thrust::tuple_size<thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9> >::value>::copyShfl(val, delta, width);
}
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
__device__ __forceinline__ void copyVals(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
unsigned int tid, unsigned int delta)
{
For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::copy(svals, val, tid, delta);
}
//////////////////////////////////////////////////////
// merge
template <typename K, typename V, class Cmp>
__device__ __forceinline__ void mergeShfl(K& key, V& val, const Cmp& cmp, unsigned int delta, int width)
{
K reg = shfl_down(key, delta, width);
if (cmp(reg, key))
{
key = reg;
copyValsShfl(val, delta, width);
}
}
template <typename K, typename V, class Cmp>
__device__ __forceinline__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, unsigned int tid, unsigned int delta)
{
K reg = skeys[tid + delta];
if (cmp(reg, key))
{
skeys[tid] = key = reg;
copyVals(svals, val, tid, delta);
}
}
template <typename K,
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
class Cmp>
__device__ __forceinline__ void mergeShfl(K& key,
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
const Cmp& cmp,
unsigned int delta, int width)
{
K reg = shfl_down(key, delta, width);
if (cmp(reg, key))
{
key = reg;
copyValsShfl(val, delta, width);
}
}
template <typename K,
typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
class Cmp>
__device__ __forceinline__ void merge(volatile K* skeys, K& key,
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
const Cmp& cmp, unsigned int tid, unsigned int delta)
{
K reg = skeys[tid + delta];
if (cmp(reg, key))
{
skeys[tid] = key = reg;
copyVals(svals, val, tid, delta);
}
}
template <typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
__device__ __forceinline__ void mergeShfl(const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
unsigned int delta, int width)
{
For<0, thrust::tuple_size<thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9> >::value>::mergeShfl(key, val, cmp, delta, width);
}
template <typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
__device__ __forceinline__ void merge(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
unsigned int tid, unsigned int delta)
{
For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::merge(skeys, key, svals, val, cmp, tid, delta);
}
//////////////////////////////////////////////////////
// Generic
template <unsigned int N> struct Generic
{
template <class KP, class KR, class VP, class VR, class Cmp>
static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
{
loadToSmem(skeys, key, tid);
loadToSmem(svals, val, tid);
if (N >= 32)
__syncthreads();
if (N >= 2048)
{
if (tid < 1024)
merge(skeys, key, svals, val, cmp, tid, 1024);
__syncthreads();
}
if (N >= 1024)
{
if (tid < 512)
merge(skeys, key, svals, val, cmp, tid, 512);
__syncthreads();
}
if (N >= 512)
{
if (tid < 256)
merge(skeys, key, svals, val, cmp, tid, 256);
__syncthreads();
}
if (N >= 256)
{
if (tid < 128)
merge(skeys, key, svals, val, cmp, tid, 128);
__syncthreads();
}
if (N >= 128)
{
if (tid < 64)
merge(skeys, key, svals, val, cmp, tid, 64);
__syncthreads();
}
if (N >= 64)
{
if (tid < 32)
merge(skeys, key, svals, val, cmp, tid, 32);
}
if (tid < 16)
{
merge(skeys, key, svals, val, cmp, tid, 16);
merge(skeys, key, svals, val, cmp, tid, 8);
merge(skeys, key, svals, val, cmp, tid, 4);
merge(skeys, key, svals, val, cmp, tid, 2);
merge(skeys, key, svals, val, cmp, tid, 1);
}
}
};
template <unsigned int I, class KP, class KR, class VP, class VR, class Cmp>
struct Unroll
{
static __device__ void loopShfl(KR key, VR val, Cmp cmp, unsigned int N)
{
mergeShfl(key, val, cmp, I, N);
Unroll<I / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
}
static __device__ void loop(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
{
merge(skeys, key, svals, val, cmp, tid, I);
Unroll<I / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
}
};
template <class KP, class KR, class VP, class VR, class Cmp>
struct Unroll<0, KP, KR, VP, VR, Cmp>
{
static __device__ void loopShfl(KR, VR, Cmp, unsigned int)
{
}
static __device__ void loop(KP, KR, VP, VR, unsigned int, Cmp)
{
}
};
template <unsigned int N> struct WarpOptimized
{
template <class KP, class KR, class VP, class VR, class Cmp>
static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
{
#if 0 // __CUDA_ARCH__ >= 300
CV_UNUSED(skeys);
CV_UNUSED(svals);
CV_UNUSED(tid);
Unroll<N / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
#else
loadToSmem(skeys, key, tid);
loadToSmem(svals, val, tid);
if (tid < N / 2)
Unroll<N / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
#endif
}
};
template <unsigned int N> struct GenericOptimized32
{
enum { M = N / 32 };
template <class KP, class KR, class VP, class VR, class Cmp>
static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
{
const unsigned int laneId = Warp::laneId();
#if 0 // __CUDA_ARCH__ >= 300
Unroll<16, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, warpSize);
if (laneId == 0)
{
loadToSmem(skeys, key, tid / 32);
loadToSmem(svals, val, tid / 32);
}
#else
loadToSmem(skeys, key, tid);
loadToSmem(svals, val, tid);
if (laneId < 16)
Unroll<16, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
__syncthreads();
if (laneId == 0)
{
loadToSmem(skeys, key, tid / 32);
loadToSmem(svals, val, tid / 32);
}
#endif
__syncthreads();
loadFromSmem(skeys, key, tid);
if (tid < 32)
{
#if 0 // __CUDA_ARCH__ >= 300
loadFromSmem(svals, val, tid);
Unroll<M / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, M);
#else
Unroll<M / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
#endif
}
}
};
template <bool val, class T1, class T2> struct StaticIf;
template <class T1, class T2> struct StaticIf<true, T1, T2>
{
typedef T1 type;
};
template <class T1, class T2> struct StaticIf<false, T1, T2>
{
typedef T2 type;
};
template <unsigned int N> struct IsPowerOf2
{
enum { value = ((N != 0) && !(N & (N - 1))) };
};
template <unsigned int N> struct Dispatcher
{
typedef typename StaticIf<
(N <= 32) && IsPowerOf2<N>::value,
WarpOptimized<N>,
typename StaticIf<
(N <= 1024) && IsPowerOf2<N>::value,
GenericOptimized32<N>,
Generic<N>
>::type
>::type reductor;
};
}
}}}
//! @endcond
#endif // OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP
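Note: the merge steps above keep whichever key wins under `cmp` and drag its value along, which is how the CUDA tree reduction computes e.g. an argmin (key = pixel value, val = index). A sequential host-side stand-in for the parallel tree (standalone, values illustrative):
#include <cstdio>
#include <functional>
template <class Cmp>
void merge(float& key, int& val, float otherKey, int otherVal, Cmp cmp)
{
    if (cmp(otherKey, key)) { key = otherKey; val = otherVal; }
}
int main()
{
    float keys[4] = {3.f, 1.f, 4.f, 1.5f};
    int   vals[4] = {0, 1, 2, 3};
    // Fold everything into lane 0, as the tree reduction does in log steps.
    for (int i = 1; i < 4; ++i)
        merge(keys[0], vals[0], keys[i], vals[i], std::less<float>());
    std::printf("min=%g at index %d\n", keys[0], vals[0]); // min=1 at index 1
    return 0;
}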


@@ -0,0 +1,392 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_CUDA_TRANSFORM_DETAIL_HPP
#define OPENCV_CUDA_TRANSFORM_DETAIL_HPP
#include "../common.hpp"
#include "../vec_traits.hpp"
#include "../functional.hpp"
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
namespace transform_detail
{
//! Read Write Traits
template <typename T, typename D, int shift> struct UnaryReadWriteTraits
{
typedef typename TypeVec<T, shift>::vec_type read_type;
typedef typename TypeVec<D, shift>::vec_type write_type;
};
template <typename T1, typename T2, typename D, int shift> struct BinaryReadWriteTraits
{
typedef typename TypeVec<T1, shift>::vec_type read_type1;
typedef typename TypeVec<T2, shift>::vec_type read_type2;
typedef typename TypeVec<D, shift>::vec_type write_type;
};
//! Transform kernels
template <int shift> struct OpUnroller;
template <> struct OpUnroller<1>
{
template <typename T, typename D, typename UnOp, typename Mask>
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.x = op(src.x);
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.x = op(src1.x, src2.x);
}
};
template <> struct OpUnroller<2>
{
template <typename T, typename D, typename UnOp, typename Mask>
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.x = op(src.x);
if (mask(y, x_shifted + 1))
dst.y = op(src.y);
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.x = op(src1.x, src2.x);
if (mask(y, x_shifted + 1))
dst.y = op(src1.y, src2.y);
}
};
template <> struct OpUnroller<3>
{
template <typename T, typename D, typename UnOp, typename Mask>
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.x = op(src.x);
if (mask(y, x_shifted + 1))
dst.y = op(src.y);
if (mask(y, x_shifted + 2))
dst.z = op(src.z);
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.x = op(src1.x, src2.x);
if (mask(y, x_shifted + 1))
dst.y = op(src1.y, src2.y);
if (mask(y, x_shifted + 2))
dst.z = op(src1.z, src2.z);
}
};
template <> struct OpUnroller<4>
{
template <typename T, typename D, typename UnOp, typename Mask>
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.x = op(src.x);
if (mask(y, x_shifted + 1))
dst.y = op(src.y);
if (mask(y, x_shifted + 2))
dst.z = op(src.z);
if (mask(y, x_shifted + 3))
dst.w = op(src.w);
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.x = op(src1.x, src2.x);
if (mask(y, x_shifted + 1))
dst.y = op(src1.y, src2.y);
if (mask(y, x_shifted + 2))
dst.z = op(src1.z, src2.z);
if (mask(y, x_shifted + 3))
dst.w = op(src1.w, src2.w);
}
};
template <> struct OpUnroller<8>
{
template <typename T, typename D, typename UnOp, typename Mask>
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.a0 = op(src.a0);
if (mask(y, x_shifted + 1))
dst.a1 = op(src.a1);
if (mask(y, x_shifted + 2))
dst.a2 = op(src.a2);
if (mask(y, x_shifted + 3))
dst.a3 = op(src.a3);
if (mask(y, x_shifted + 4))
dst.a4 = op(src.a4);
if (mask(y, x_shifted + 5))
dst.a5 = op(src.a5);
if (mask(y, x_shifted + 6))
dst.a6 = op(src.a6);
if (mask(y, x_shifted + 7))
dst.a7 = op(src.a7);
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.a0 = op(src1.a0, src2.a0);
if (mask(y, x_shifted + 1))
dst.a1 = op(src1.a1, src2.a1);
if (mask(y, x_shifted + 2))
dst.a2 = op(src1.a2, src2.a2);
if (mask(y, x_shifted + 3))
dst.a3 = op(src1.a3, src2.a3);
if (mask(y, x_shifted + 4))
dst.a4 = op(src1.a4, src2.a4);
if (mask(y, x_shifted + 5))
dst.a5 = op(src1.a5, src2.a5);
if (mask(y, x_shifted + 6))
dst.a6 = op(src1.a6, src2.a6);
if (mask(y, x_shifted + 7))
dst.a7 = op(src1.a7, src2.a7);
}
};
template <typename T, typename D, typename UnOp, typename Mask>
static __global__ void transformSmart(const PtrStepSz<T> src_, PtrStep<D> dst_, const Mask mask, const UnOp op)
{
typedef TransformFunctorTraits<UnOp> ft;
typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::read_type read_type;
typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::write_type write_type;
const int x = threadIdx.x + blockIdx.x * blockDim.x;
const int y = threadIdx.y + blockIdx.y * blockDim.y;
const int x_shifted = x * ft::smart_shift;
if (y < src_.rows)
{
const T* src = src_.ptr(y);
D* dst = dst_.ptr(y);
if (x_shifted + ft::smart_shift - 1 < src_.cols)
{
const read_type src_n_el = ((const read_type*)src)[x];
OpUnroller<ft::smart_shift>::unroll(src_n_el, ((write_type*)dst)[x], mask, op, x_shifted, y);
}
else
{
for (int real_x = x_shifted; real_x < src_.cols; ++real_x)
{
if (mask(y, real_x))
dst[real_x] = op(src[real_x]);
}
}
}
}
template <typename T, typename D, typename UnOp, typename Mask>
__global__ static void transformSimple(const PtrStepSz<T> src, PtrStep<D> dst, const Mask mask, const UnOp op)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < src.cols && y < src.rows && mask(y, x))
{
dst.ptr(y)[x] = op(src.ptr(y)[x]);
}
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __global__ void transformSmart(const PtrStepSz<T1> src1_, const PtrStep<T2> src2_, PtrStep<D> dst_,
const Mask mask, const BinOp op)
{
typedef TransformFunctorTraits<BinOp> ft;
typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type1 read_type1;
typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type2 read_type2;
typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::write_type write_type;
const int x = threadIdx.x + blockIdx.x * blockDim.x;
const int y = threadIdx.y + blockIdx.y * blockDim.y;
const int x_shifted = x * ft::smart_shift;
if (y < src1_.rows)
{
const T1* src1 = src1_.ptr(y);
const T2* src2 = src2_.ptr(y);
D* dst = dst_.ptr(y);
if (x_shifted + ft::smart_shift - 1 < src1_.cols)
{
const read_type1 src1_n_el = ((const read_type1*)src1)[x];
const read_type2 src2_n_el = ((const read_type2*)src2)[x];
OpUnroller<ft::smart_shift>::unroll(src1_n_el, src2_n_el, ((write_type*)dst)[x], mask, op, x_shifted, y);
}
else
{
for (int real_x = x_shifted; real_x < src1_.cols; ++real_x)
{
if (mask(y, real_x))
dst[real_x] = op(src1[real_x], src2[real_x]);
}
}
}
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __global__ void transformSimple(const PtrStepSz<T1> src1, const PtrStep<T2> src2, PtrStep<D> dst,
const Mask mask, const BinOp op)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < src1.cols && y < src1.rows && mask(y, x))
{
const T1 src1_data = src1.ptr(y)[x];
const T2 src2_data = src2.ptr(y)[x];
dst.ptr(y)[x] = op(src1_data, src2_data);
}
}
template <bool UseSmart> struct TransformDispatcher;
template<> struct TransformDispatcher<false>
{
template <typename T, typename D, typename UnOp, typename Mask>
static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
{
typedef TransformFunctorTraits<UnOp> ft;
const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1);
transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
{
typedef TransformFunctorTraits<BinOp> ft;
const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
const dim3 grid(divUp(src1.cols, threads.x), divUp(src1.rows, threads.y), 1);
transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<> struct TransformDispatcher<true>
{
template <typename T, typename D, typename UnOp, typename Mask>
static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
{
typedef TransformFunctorTraits<UnOp> ft;
CV_StaticAssert(ft::smart_shift != 1, "");
if (!isAligned(src.data, ft::smart_shift * sizeof(T)) || !isAligned(src.step, ft::smart_shift * sizeof(T)) ||
!isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D)))
{
TransformDispatcher<false>::call(src, dst, op, mask, stream);
return;
}
const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
const dim3 grid(divUp(src.cols, threads.x * ft::smart_shift), divUp(src.rows, threads.y), 1);
transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
{
typedef TransformFunctorTraits<BinOp> ft;
CV_StaticAssert(ft::smart_shift != 1, "");
if (!isAligned(src1.data, ft::smart_shift * sizeof(T1)) || !isAligned(src1.step, ft::smart_shift * sizeof(T1)) ||
!isAligned(src2.data, ft::smart_shift * sizeof(T2)) || !isAligned(src2.step, ft::smart_shift * sizeof(T2)) ||
!isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D)))
{
TransformDispatcher<false>::call(src1, src2, dst, op, mask, stream);
return;
}
const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
const dim3 grid(divUp(src1.cols, threads.x * ft::smart_shift), divUp(src1.rows, threads.y), 1);
transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
} // namespace transform_detail
}}} // namespace cv { namespace cuda { namespace device
//! @endcond
#endif // OPENCV_CUDA_TRANSFORM_DETAIL_HPP
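The dispatcher above is the point of this header: when the source and destination pointers and steps are suitably aligned, each thread loads ft::smart_shift elements through one wide vector access and OpUnroller applies the operation element-wise; otherwise it falls back to the one-element-per-thread transformSimple. A minimal standalone sketch of the same pattern, outside the OpenCV type machinery (the kernel, buffer names, and the fixed shift of 4 are illustrative assumptions, not library code):

#include <cuda_runtime.h>
#include <cstdio>

// Invert a byte image, 4 pixels per thread via one uchar4 load/store,
// with a scalar tail loop mirroring transformSmart's else-branch.
__global__ void invertSmart(const unsigned char* src, unsigned char* dst, int n)
{
    const int x  = blockIdx.x * blockDim.x + threadIdx.x;
    const int x4 = x * 4;                                   // "smart_shift" = 4
    if (x4 + 3 < n)
    {
        uchar4 v = reinterpret_cast<const uchar4*>(src)[x]; // one 32-bit load
        v.x = 255 - v.x; v.y = 255 - v.y;
        v.z = 255 - v.z; v.w = 255 - v.w;
        reinterpret_cast<uchar4*>(dst)[x] = v;              // one 32-bit store
    }
    else
    {
        for (int i = x4; i < n; ++i)                        // ragged tail
            dst[i] = 255 - src[i];
    }
}

int main()
{
    const int n = 1 << 20;
    unsigned char *src, *dst;
    cudaMalloc(&src, n);
    cudaMalloc(&dst, n);
    cudaMemset(src, 7, n);
    const int threads = 256;
    const int blocks  = (n / 4 + threads - 1) / threads;
    invertSmart<<<blocks, threads>>>(src, dst, n);
    cudaDeviceSynchronize();
    unsigned char probe = 0;
    cudaMemcpy(&probe, dst, 1, cudaMemcpyDeviceToHost);
    printf("dst[0] = %d\n", probe);                         // expect 248
    cudaFree(src);
    cudaFree(dst);
    return 0;
}

The isAligned checks in TransformDispatcher<true>::call exist precisely because the vectorized load above is only valid when the data pointer and row step are multiples of the vector size.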

View File

@@ -0,0 +1,191 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_CUDA_TYPE_TRAITS_DETAIL_HPP
#define OPENCV_CUDA_TYPE_TRAITS_DETAIL_HPP
#include "../common.hpp"
#include "../vec_traits.hpp"
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
namespace type_traits_detail
{
template <bool, typename T1, typename T2> struct Select { typedef T1 type; };
template <typename T1, typename T2> struct Select<false, T1, T2> { typedef T2 type; };
template <typename T> struct IsSignedIntergral { enum {value = 0}; };
template <> struct IsSignedIntergral<schar> { enum {value = 1}; };
template <> struct IsSignedIntergral<char1> { enum {value = 1}; };
template <> struct IsSignedIntergral<short> { enum {value = 1}; };
template <> struct IsSignedIntergral<short1> { enum {value = 1}; };
template <> struct IsSignedIntergral<int> { enum {value = 1}; };
template <> struct IsSignedIntergral<int1> { enum {value = 1}; };
template <typename T> struct IsUnsignedIntegral { enum {value = 0}; };
template <> struct IsUnsignedIntegral<uchar> { enum {value = 1}; };
template <> struct IsUnsignedIntegral<uchar1> { enum {value = 1}; };
template <> struct IsUnsignedIntegral<ushort> { enum {value = 1}; };
template <> struct IsUnsignedIntegral<ushort1> { enum {value = 1}; };
template <> struct IsUnsignedIntegral<uint> { enum {value = 1}; };
template <> struct IsUnsignedIntegral<uint1> { enum {value = 1}; };
template <typename T> struct IsIntegral { enum {value = IsSignedIntergral<T>::value || IsUnsignedIntegral<T>::value}; };
template <> struct IsIntegral<char> { enum {value = 1}; };
template <> struct IsIntegral<bool> { enum {value = 1}; };
template <typename T> struct IsFloat { enum {value = 0}; };
template <> struct IsFloat<float> { enum {value = 1}; };
template <> struct IsFloat<double> { enum {value = 1}; };
template <typename T> struct IsVec { enum {value = 0}; };
template <> struct IsVec<uchar1> { enum {value = 1}; };
template <> struct IsVec<uchar2> { enum {value = 1}; };
template <> struct IsVec<uchar3> { enum {value = 1}; };
template <> struct IsVec<uchar4> { enum {value = 1}; };
template <> struct IsVec<uchar8> { enum {value = 1}; };
template <> struct IsVec<char1> { enum {value = 1}; };
template <> struct IsVec<char2> { enum {value = 1}; };
template <> struct IsVec<char3> { enum {value = 1}; };
template <> struct IsVec<char4> { enum {value = 1}; };
template <> struct IsVec<char8> { enum {value = 1}; };
template <> struct IsVec<ushort1> { enum {value = 1}; };
template <> struct IsVec<ushort2> { enum {value = 1}; };
template <> struct IsVec<ushort3> { enum {value = 1}; };
template <> struct IsVec<ushort4> { enum {value = 1}; };
template <> struct IsVec<ushort8> { enum {value = 1}; };
template <> struct IsVec<short1> { enum {value = 1}; };
template <> struct IsVec<short2> { enum {value = 1}; };
template <> struct IsVec<short3> { enum {value = 1}; };
template <> struct IsVec<short4> { enum {value = 1}; };
template <> struct IsVec<short8> { enum {value = 1}; };
template <> struct IsVec<uint1> { enum {value = 1}; };
template <> struct IsVec<uint2> { enum {value = 1}; };
template <> struct IsVec<uint3> { enum {value = 1}; };
template <> struct IsVec<uint4> { enum {value = 1}; };
template <> struct IsVec<uint8> { enum {value = 1}; };
template <> struct IsVec<int1> { enum {value = 1}; };
template <> struct IsVec<int2> { enum {value = 1}; };
template <> struct IsVec<int3> { enum {value = 1}; };
template <> struct IsVec<int4> { enum {value = 1}; };
template <> struct IsVec<int8> { enum {value = 1}; };
template <> struct IsVec<float1> { enum {value = 1}; };
template <> struct IsVec<float2> { enum {value = 1}; };
template <> struct IsVec<float3> { enum {value = 1}; };
template <> struct IsVec<float4> { enum {value = 1}; };
template <> struct IsVec<float8> { enum {value = 1}; };
template <> struct IsVec<double1> { enum {value = 1}; };
template <> struct IsVec<double2> { enum {value = 1}; };
template <> struct IsVec<double3> { enum {value = 1}; };
template <> struct IsVec<double4> { enum {value = 1}; };
template <> struct IsVec<double8> { enum {value = 1}; };
template <class U> struct AddParameterType { typedef const U& type; };
template <class U> struct AddParameterType<U&> { typedef U& type; };
template <> struct AddParameterType<void> { typedef void type; };
template <class U> struct ReferenceTraits
{
enum { value = false };
typedef U type;
};
template <class U> struct ReferenceTraits<U&>
{
enum { value = true };
typedef U type;
};
template <class U> struct PointerTraits
{
enum { value = false };
typedef void type;
};
template <class U> struct PointerTraits<U*>
{
enum { value = true };
typedef U type;
};
template <class U> struct PointerTraits<U*&>
{
enum { value = true };
typedef U type;
};
template <class U> struct UnConst
{
typedef U type;
enum { value = 0 };
};
template <class U> struct UnConst<const U>
{
typedef U type;
enum { value = 1 };
};
template <class U> struct UnConst<const U&>
{
typedef U& type;
enum { value = 1 };
};
template <class U> struct UnVolatile
{
typedef U type;
enum { value = 0 };
};
template <class U> struct UnVolatile<volatile U>
{
typedef U type;
enum { value = 1 };
};
template <class U> struct UnVolatile<volatile U&>
{
typedef U& type;
enum { value = 1 };
};
} // namespace type_traits_detail
}}} // namespace cv { namespace cuda { namespace device
//! @endcond
#endif // OPENCV_CUDA_TYPE_TRAITS_DETAIL_HPP
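These traits predate <type_traits> and rebuild it from template specialization: each predicate is a struct whose specializations set value, and Select picks one of two types from a compile-time boolean. A small self-contained sketch of how such traits drive a compile-time choice (the Accumulator trait and its rules are illustrative assumptions, not part of the library):

#include <cstdio>

template <bool, typename T1, typename T2> struct Select { typedef T1 type; };
template <typename T1, typename T2> struct Select<false, T1, T2> { typedef T2 type; };

template <typename T> struct IsIntegral       { enum { value = 0 }; };
template <>           struct IsIntegral<int>  { enum { value = 1 }; };
template <>           struct IsIntegral<char> { enum { value = 1 }; };

// Choose a wide accumulator at compile time: integral types sum into
// long long, everything else into double.
template <typename T> struct Accumulator
{
    typedef typename Select<IsIntegral<T>::value != 0, long long, double>::type type;
};

int main()
{
    static_assert(IsIntegral<int>::value == 1, "int is integral");
    static_assert(IsIntegral<float>::value == 0, "float is not");
    Accumulator<char>::type  ai = 0;    // long long
    Accumulator<float>::type af = 0;    // double
    printf("%zu %zu\n", sizeof ai, sizeof af);
    return 0;
}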

View File

@@ -0,0 +1,121 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_CUDA_VEC_DISTANCE_DETAIL_HPP
#define OPENCV_CUDA_VEC_DISTANCE_DETAIL_HPP
#include "../datamov_utils.hpp"
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
namespace vec_distance_detail
{
template <int THREAD_DIM, int N> struct UnrollVecDiffCached
{
template <typename Dist, typename T1, typename T2>
static __device__ void calcCheck(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int ind)
{
if (ind < len)
{
T1 val1 = *vecCached++;
T2 val2;
ForceGlob<T2>::Load(vecGlob, ind, val2);
dist.reduceIter(val1, val2);
UnrollVecDiffCached<THREAD_DIM, N - 1>::calcCheck(vecCached, vecGlob, len, dist, ind + THREAD_DIM);
}
}
template <typename Dist, typename T1, typename T2>
static __device__ void calcWithoutCheck(const T1* vecCached, const T2* vecGlob, Dist& dist)
{
T1 val1 = *vecCached++;
T2 val2;
ForceGlob<T2>::Load(vecGlob, 0, val2);
vecGlob += THREAD_DIM;
dist.reduceIter(val1, val2);
UnrollVecDiffCached<THREAD_DIM, N - 1>::calcWithoutCheck(vecCached, vecGlob, dist);
}
};
template <int THREAD_DIM> struct UnrollVecDiffCached<THREAD_DIM, 0>
{
template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calcCheck(const T1*, const T2*, int, Dist&, int)
{
}
template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calcWithoutCheck(const T1*, const T2*, Dist&)
{
}
};
template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN> struct VecDiffCachedCalculator;
template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, false>
{
template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
{
UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcCheck(vecCached, vecGlob, len, dist, tid);
}
};
template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, true>
{
template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
{
UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcWithoutCheck(vecCached, vecGlob + tid, dist);
}
};
} // namespace vec_distance_detail
}}} // namespace cv { namespace cuda { namespace device
//! @endcond
#endif // OPENCV_CUDA_VEC_DISTANCE_DETAIL_HPP
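The recursion above is compile-time loop unrolling: N counts the elements each thread still owes (MAX_LEN / THREAD_DIM in total), every step hands one cached/global element pair to Dist::reduceIter, and the N = 0 specialization terminates the chain. A single-threaded host sketch of the Dist contract this header assumes (an L2-style accumulator; all names here are illustrative):

#include <cstdio>

struct L2Dist
{
    float sum;
    L2Dist() : sum(0.f) {}
    void reduceIter(float v1, float v2)   // invoked once per element pair
    {
        const float d = v1 - v2;
        sum += d * d;
    }
};

// Mirrors UnrollVecDiffCached::calcWithoutCheck: the cached pointer advances
// by one element, the "global" pointer by the thread stride.
template <int STEP, int N> struct Unroll
{
    template <typename Dist>
    static void calc(const float* cached, const float* glob, Dist& dist)
    {
        dist.reduceIter(*cached, *glob);
        Unroll<STEP, N - 1>::calc(cached + 1, glob + STEP, dist);
    }
};
template <int STEP> struct Unroll<STEP, 0>
{
    template <typename Dist> static void calc(const float*, const float*, Dist&) {}
};

int main()
{
    float a[4] = {1, 2, 3, 4}, b[4] = {0, 0, 0, 0};
    L2Dist d;
    Unroll<1, 4>::calc(a, b, d);          // STEP = 1 plays a single "thread"
    printf("%f\n", d.sum);                // 1 + 4 + 9 + 16 = 30
    return 0;
}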

View File

@@ -0,0 +1,88 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_CUDA_DYNAMIC_SMEM_HPP
#define OPENCV_CUDA_DYNAMIC_SMEM_HPP
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
template<class T> struct DynamicSharedMem
{
__device__ __forceinline__ operator T*()
{
extern __shared__ int __smem[];
return (T*)__smem;
}
__device__ __forceinline__ operator const T*() const
{
extern __shared__ int __smem[];
return (T*)__smem;
}
};
// specialize for double to avoid unaligned memory access compile errors
template<> struct DynamicSharedMem<double>
{
__device__ __forceinline__ operator double*()
{
extern __shared__ double __smem_d[];
return (double*)__smem_d;
}
__device__ __forceinline__ operator const double*() const
{
extern __shared__ double __smem_d[];
return (double*)__smem_d;
}
};
}}}
//! @endcond
#endif // OPENCV_CUDA_DYNAMIC_SMEM_HPP
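A minimal usage sketch for the wrapper above: the byte count of dynamic shared memory is supplied as the third launch parameter, and DynamicSharedMem's conversion operator hands back a typed pointer inside the kernel (the kernel itself is an illustrative assumption; it expects a single block with n <= blockDim.x):

// Reverse a small float array through dynamically sized shared memory.
__global__ void reverseBlock(const float* in, float* out, int n)
{
    cv::cuda::device::DynamicSharedMem<float> smem;   // wraps extern __shared__
    float* buf = smem;                                // operator T*()
    const int i = threadIdx.x;
    if (i < n)
        buf[i] = in[i];
    __syncthreads();
    if (i < n)
        out[i] = buf[n - 1 - i];
}

// Launch sketch: reverseBlock<<<1, n, n * sizeof(float)>>>(d_in, d_out, n);

The double specialization exists because every instantiation shares one extern __shared__ symbol, and re-declaring it with differently aligned element types triggers the compile errors the comment above mentions; double therefore gets its own properly aligned symbol.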

View File

@@ -0,0 +1,269 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_CUDA_EMULATION_HPP_
#define OPENCV_CUDA_EMULATION_HPP_
#include "common.hpp"
#include "warp_reduce.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
struct Emulation
{
static __device__ __forceinline__ int syncthreadsOr(int pred)
{
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)
// just a compilation stub
return 0;
#else
return __syncthreads_or(pred);
#endif
}
template<int CTA_SIZE>
static __forceinline__ __device__ int Ballot(int predicate)
{
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
return __ballot(predicate);
#else
__shared__ volatile int cta_buffer[CTA_SIZE];
int tid = threadIdx.x;
cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
return warp_reduce(cta_buffer);
#endif
}
struct smem
{
enum { TAG_MASK = (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U };
template<typename T>
static __device__ __forceinline__ T atomicInc(T* address, T val)
{
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
T count;
unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
do
{
count = *address & TAG_MASK;
count = tag | (count + 1);
*address = count;
} while (*address != count);
return (count & TAG_MASK) - 1;
#else
return ::atomicInc(address, val);
#endif
}
template<typename T>
static __device__ __forceinline__ T atomicAdd(T* address, T val)
{
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
T count;
unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
do
{
count = *address & TAG_MASK;
count = tag | (count + val);
*address = count;
} while (*address != count);
return (count & TAG_MASK) - val;
#else
return ::atomicAdd(address, val);
#endif
}
template<typename T>
static __device__ __forceinline__ T atomicMin(T* address, T val)
{
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
T count = ::min(*address, val);
do
{
*address = count;
} while (*address > count);
return count;
#else
return ::atomicMin(address, val);
#endif
}
}; // struct smem
struct glob
{
static __device__ __forceinline__ int atomicAdd(int* address, int val)
{
return ::atomicAdd(address, val);
}
static __device__ __forceinline__ unsigned int atomicAdd(unsigned int* address, unsigned int val)
{
return ::atomicAdd(address, val);
}
static __device__ __forceinline__ float atomicAdd(float* address, float val)
{
#if __CUDA_ARCH__ >= 200
return ::atomicAdd(address, val);
#else
int* address_as_i = (int*) address;
int old = *address_as_i, assumed;
do {
assumed = old;
old = ::atomicCAS(address_as_i, assumed,
__float_as_int(val + __int_as_float(assumed)));
} while (assumed != old);
return __int_as_float(old);
#endif
}
static __device__ __forceinline__ double atomicAdd(double* address, double val)
{
#if __CUDA_ARCH__ >= 130
unsigned long long int* address_as_ull = (unsigned long long int*) address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = ::atomicCAS(address_as_ull, assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));
} while (assumed != old);
return __longlong_as_double(old);
#else
CV_UNUSED(address);
CV_UNUSED(val);
return 0.0;
#endif
}
static __device__ __forceinline__ int atomicMin(int* address, int val)
{
return ::atomicMin(address, val);
}
static __device__ __forceinline__ float atomicMin(float* address, float val)
{
#if __CUDA_ARCH__ >= 120
int* address_as_i = (int*) address;
int old = *address_as_i, assumed;
do {
assumed = old;
old = ::atomicCAS(address_as_i, assumed,
__float_as_int(::fminf(val, __int_as_float(assumed))));
} while (assumed != old);
return __int_as_float(old);
#else
CV_UNUSED(address);
CV_UNUSED(val);
return 0.0f;
#endif
}
static __device__ __forceinline__ double atomicMin(double* address, double val)
{
#if __CUDA_ARCH__ >= 130
unsigned long long int* address_as_ull = (unsigned long long int*) address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = ::atomicCAS(address_as_ull, assumed,
__double_as_longlong(::fmin(val, __longlong_as_double(assumed))));
} while (assumed != old);
return __longlong_as_double(old);
#else
CV_UNUSED(address);
CV_UNUSED(val);
return 0.0;
#endif
}
static __device__ __forceinline__ int atomicMax(int* address, int val)
{
return ::atomicMax(address, val);
}
static __device__ __forceinline__ float atomicMax(float* address, float val)
{
#if __CUDA_ARCH__ >= 120
int* address_as_i = (int*) address;
int old = *address_as_i, assumed;
do {
assumed = old;
old = ::atomicCAS(address_as_i, assumed,
__float_as_int(::fmaxf(val, __int_as_float(assumed))));
} while (assumed != old);
return __int_as_float(old);
#else
CV_UNUSED(address);
CV_UNUSED(val);
return 0.0f;
#endif
}
static __device__ __forceinline__ double atomicMax(double* address, double val)
{
#if __CUDA_ARCH__ >= 130
unsigned long long int* address_as_ull = (unsigned long long int*) address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = ::atomicCAS(address_as_ull, assumed,
__double_as_longlong(::fmax(val, __longlong_as_double(assumed))));
} while (assumed != old);
return __longlong_as_double(old);
#else
CV_UNUSED(address);
CV_UNUSED(val);
return 0.0;
#endif
}
};
}; //struct Emulation
}}} // namespace cv { namespace cuda { namespace device
//! @endcond
#endif /* OPENCV_CUDA_EMULATION_HPP_ */
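A usage sketch for the wrappers above: native double atomicAdd (and float atomicMin/atomicMax) is missing on the older architectures these #if branches target, so the atomicCAS loops emulate it. Summing an array into one double accumulator (kernel and names are illustrative; compile with nvcc so this header's Emulation struct is in scope):

__global__ void sumAll(const double* data, int n, double* result)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        cv::cuda::device::Emulation::glob::atomicAdd(result, data[i]);
}

// Launch sketch: zero the accumulator first, then one thread per element:
//   cudaMemset(d_result, 0, sizeof(double));
//   sumAll<<<(n + 255) / 256, 256>>>(d_data, n, d_result);

The CAS pattern is correct because each iteration re-reads the destination, recomputes old + val, and retries until no other thread intervened in between; the __double_as_longlong round trip is needed since atomicCAS only exists for integer types.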

View File

@@ -0,0 +1,286 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_CUDA_FILTERS_HPP
#define OPENCV_CUDA_FILTERS_HPP
#include "saturate_cast.hpp"
#include "vec_traits.hpp"
#include "vec_math.hpp"
#include "type_traits.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
template <typename Ptr2D> struct PointFilter
{
typedef typename Ptr2D::elem_type elem_type;
typedef float index_type;
explicit __host__ __device__ __forceinline__ PointFilter(const Ptr2D& src_, float fx = 0.f, float fy = 0.f)
: src(src_)
{
CV_UNUSED(fx);
CV_UNUSED(fy);
}
__device__ __forceinline__ elem_type operator ()(float y, float x) const
{
return src(__float2int_rz(y), __float2int_rz(x));
}
Ptr2D src;
};
template <typename Ptr2D> struct LinearFilter
{
typedef typename Ptr2D::elem_type elem_type;
typedef float index_type;
explicit __host__ __device__ __forceinline__ LinearFilter(const Ptr2D& src_, float fx = 0.f, float fy = 0.f)
: src(src_)
{
CV_UNUSED(fx);
CV_UNUSED(fy);
}
__device__ __forceinline__ elem_type operator ()(float y, float x) const
{
typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type;
work_type out = VecTraits<work_type>::all(0);
const int x1 = __float2int_rd(x);
const int y1 = __float2int_rd(y);
const int x2 = x1 + 1;
const int y2 = y1 + 1;
elem_type src_reg = src(y1, x1);
out = out + src_reg * ((x2 - x) * (y2 - y));
src_reg = src(y1, x2);
out = out + src_reg * ((x - x1) * (y2 - y));
src_reg = src(y2, x1);
out = out + src_reg * ((x2 - x) * (y - y1));
src_reg = src(y2, x2);
out = out + src_reg * ((x - x1) * (y - y1));
return saturate_cast<elem_type>(out);
}
Ptr2D src;
};
template <typename Ptr2D> struct CubicFilter
{
typedef typename Ptr2D::elem_type elem_type;
typedef float index_type;
typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type;
explicit __host__ __device__ __forceinline__ CubicFilter(const Ptr2D& src_, float fx = 0.f, float fy = 0.f)
: src(src_)
{
CV_UNUSED(fx);
CV_UNUSED(fy);
}
static __device__ __forceinline__ float bicubicCoeff(float x_)
{
float x = fabsf(x_);
if (x <= 1.0f)
{
return x * x * (1.5f * x - 2.5f) + 1.0f;
}
else if (x < 2.0f)
{
return x * (x * (-0.5f * x + 2.5f) - 4.0f) + 2.0f;
}
else
{
return 0.0f;
}
}
__device__ elem_type operator ()(float y, float x) const
{
const float xmin = ::ceilf(x - 2.0f);
const float xmax = ::floorf(x + 2.0f);
const float ymin = ::ceilf(y - 2.0f);
const float ymax = ::floorf(y + 2.0f);
work_type sum = VecTraits<work_type>::all(0);
float wsum = 0.0f;
for (float cy = ymin; cy <= ymax; cy += 1.0f)
{
for (float cx = xmin; cx <= xmax; cx += 1.0f)
{
const float w = bicubicCoeff(x - cx) * bicubicCoeff(y - cy);
sum = sum + w * src(__float2int_rd(cy), __float2int_rd(cx));
wsum += w;
}
}
work_type res = (!wsum)? VecTraits<work_type>::all(0) : sum / wsum;
return saturate_cast<elem_type>(res);
}
Ptr2D src;
};
// for integer scaling
template <typename Ptr2D> struct IntegerAreaFilter
{
typedef typename Ptr2D::elem_type elem_type;
typedef float index_type;
explicit __host__ __device__ __forceinline__ IntegerAreaFilter(const Ptr2D& src_, float scale_x_, float scale_y_)
: src(src_), scale_x(scale_x_), scale_y(scale_y_), scale(1.f / (scale_x * scale_y)) {}
__device__ __forceinline__ elem_type operator ()(float y, float x) const
{
float fsx1 = x * scale_x;
float fsx2 = fsx1 + scale_x;
int sx1 = __float2int_ru(fsx1);
int sx2 = __float2int_rd(fsx2);
float fsy1 = y * scale_y;
float fsy2 = fsy1 + scale_y;
int sy1 = __float2int_ru(fsy1);
int sy2 = __float2int_rd(fsy2);
typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type;
work_type out = VecTraits<work_type>::all(0.f);
for(int dy = sy1; dy < sy2; ++dy)
for(int dx = sx1; dx < sx2; ++dx)
{
out = out + src(dy, dx) * scale;
}
return saturate_cast<elem_type>(out);
}
Ptr2D src;
float scale_x, scale_y, scale;
};
template <typename Ptr2D> struct AreaFilter
{
typedef typename Ptr2D::elem_type elem_type;
typedef float index_type;
explicit __host__ __device__ __forceinline__ AreaFilter(const Ptr2D& src_, float scale_x_, float scale_y_)
: src(src_), scale_x(scale_x_), scale_y(scale_y_){}
__device__ __forceinline__ elem_type operator ()(float y, float x) const
{
float fsx1 = x * scale_x;
float fsx2 = fsx1 + scale_x;
int sx1 = __float2int_ru(fsx1);
int sx2 = __float2int_rd(fsx2);
float fsy1 = y * scale_y;
float fsy2 = fsy1 + scale_y;
int sy1 = __float2int_ru(fsy1);
int sy2 = __float2int_rd(fsy2);
float scale = 1.f / (fminf(scale_x, src.width - fsx1) * fminf(scale_y, src.height - fsy1));
typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type;
work_type out = VecTraits<work_type>::all(0.f);
for (int dy = sy1; dy < sy2; ++dy)
{
for (int dx = sx1; dx < sx2; ++dx)
out = out + src(dy, dx) * scale;
if (sx1 > fsx1)
out = out + src(dy, (sx1 -1) ) * ((sx1 - fsx1) * scale);
if (sx2 < fsx2)
out = out + src(dy, sx2) * ((fsx2 -sx2) * scale);
}
if (sy1 > fsy1)
for (int dx = sx1; dx < sx2; ++dx)
out = out + src( (sy1 - 1) , dx) * ((sy1 -fsy1) * scale);
if (sy2 < fsy2)
for (int dx = sx1; dx < sx2; ++dx)
out = out + src(sy2, dx) * ((fsy2 -sy2) * scale);
if ((sy1 > fsy1) && (sx1 > fsx1))
out = out + src( (sy1 - 1) , (sx1 - 1)) * ((sy1 -fsy1) * (sx1 -fsx1) * scale);
if ((sy1 > fsy1) && (sx2 < fsx2))
out = out + src( (sy1 - 1) , sx2) * ((sy1 -fsy1) * (fsx2 -sx2) * scale);
if ((sy2 < fsy2) && (sx2 < fsx2))
out = out + src(sy2, sx2) * ((fsy2 -sy2) * (fsx2 -sx2) * scale);
if ((sy2 < fsy2) && (sx1 > fsx1))
out = out + src(sy2, (sx1 - 1)) * ((fsy2 -sy2) * (sx1 -fsx1) * scale);
return saturate_cast<elem_type>(out);
}
Ptr2D src;
float scale_x, scale_y;
int width, height;
};
}}} // namespace cv { namespace cuda { namespace device
//! @endcond
#endif // OPENCV_CUDA_FILTERS_HPP
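LinearFilter above is plain bilinear interpolation: each of the four integer neighbours of (x, y) is weighted by the area of the opposite fractional sub-rectangle. A host-side sketch of the same arithmetic (single channel, no border handling, coordinates assumed in range; the function name is illustrative):

#include <cmath>
#include <cstdio>

// Bilinear sample of a single-channel float image at fractional (y, x).
float sampleLinear(const float* img, int step, float y, float x)
{
    const int x1 = (int)std::floor(x), y1 = (int)std::floor(y);
    const int x2 = x1 + 1,             y2 = y1 + 1;
    // Weight each neighbour by the opposite sub-rectangle's area,
    // matching LinearFilter::operator().
    return img[y1 * step + x1] * ((x2 - x) * (y2 - y))
         + img[y1 * step + x2] * ((x - x1) * (y2 - y))
         + img[y2 * step + x1] * ((x2 - x) * (y - y1))
         + img[y2 * step + x2] * ((x - x1) * (y - y1));
}

int main()
{
    const float img[4] = { 0.f, 1.f,    // 2x2 image, row step = 2
                           2.f, 3.f };
    printf("%f\n", sampleLinear(img, 2, 0.5f, 0.5f));   // centre -> 1.5
    return 0;
}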

View File

@@ -40,34 +40,40 @@
//
//M*/
#ifndef OPENCV_CUDA_DEVICE_FUNCATTRIB_HPP
#define OPENCV_CUDA_DEVICE_FUNCATTRIB_HPP
#include <cstdio>
/** @file
 * @deprecated Use @ref cudev instead.
 */
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
    template<class Func>
    void printFuncAttrib(Func& func)
    {
        cudaFuncAttributes attrs;
        cudaFuncGetAttributes(&attrs, func);
        printf("=== Function stats ===\n");
        printf("Name: \n");
        printf("sharedSizeBytes = %d\n", attrs.sharedSizeBytes);
        printf("constSizeBytes = %d\n", attrs.constSizeBytes);
        printf("localSizeBytes = %d\n", attrs.localSizeBytes);
        printf("maxThreadsPerBlock = %d\n", attrs.maxThreadsPerBlock);
        printf("numRegs = %d\n", attrs.numRegs);
        printf("ptxVersion = %d\n", attrs.ptxVersion);
        printf("binaryVersion = %d\n", attrs.binaryVersion);
        printf("\n");
        fflush(stdout);
    }
}}} // namespace cv { namespace cuda { namespace device
//! @endcond
#endif /* OPENCV_CUDA_DEVICE_FUNCATTRIB_HPP */
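A minimal usage sketch for the helper above: hand any __global__ function to printFuncAttrib from host code in a .cu file and it prints the kernel's compiled resource footprint (the kernel here is an illustrative assumption):

__global__ void scale2(float* p) { p[threadIdx.x] *= 2.f; }

int main()
{
    // Dumps shared/const/local memory sizes, register count, and
    // PTX/binary versions for scale2.
    cv::cuda::device::printFuncAttrib(scale2);
    return 0;
}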

View File

@@ -0,0 +1,805 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_CUDA_FUNCTIONAL_HPP
#define OPENCV_CUDA_FUNCTIONAL_HPP
#include <functional>
#include "saturate_cast.hpp"
#include "vec_traits.hpp"
#include "type_traits.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
// Function Objects
template<typename Argument, typename Result> struct unary_function
{
typedef Argument argument_type;
typedef Result result_type;
};
template<typename Argument1, typename Argument2, typename Result> struct binary_function
{
typedef Argument1 first_argument_type;
typedef Argument2 second_argument_type;
typedef Result result_type;
};
// Arithmetic Operations
template <typename T> struct plus : binary_function<T, T, T>
{
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a,
typename TypeTraits<T>::ParameterType b) const
{
return a + b;
}
__host__ __device__ __forceinline__ plus() {}
__host__ __device__ __forceinline__ plus(const plus&) {}
};
template <typename T> struct minus : binary_function<T, T, T>
{
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a,
typename TypeTraits<T>::ParameterType b) const
{
return a - b;
}
__host__ __device__ __forceinline__ minus() {}
__host__ __device__ __forceinline__ minus(const minus&) {}
};
template <typename T> struct multiplies : binary_function<T, T, T>
{
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a,
typename TypeTraits<T>::ParameterType b) const
{
return a * b;
}
__host__ __device__ __forceinline__ multiplies() {}
__host__ __device__ __forceinline__ multiplies(const multiplies&) {}
};
template <typename T> struct divides : binary_function<T, T, T>
{
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a,
typename TypeTraits<T>::ParameterType b) const
{
return a / b;
}
__host__ __device__ __forceinline__ divides() {}
__host__ __device__ __forceinline__ divides(const divides&) {}
};
template <typename T> struct modulus : binary_function<T, T, T>
{
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a,
typename TypeTraits<T>::ParameterType b) const
{
return a % b;
}
__host__ __device__ __forceinline__ modulus() {}
__host__ __device__ __forceinline__ modulus(const modulus&) {}
};
template <typename T> struct negate : unary_function<T, T>
{
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a) const
{
return -a;
}
__host__ __device__ __forceinline__ negate() {}
__host__ __device__ __forceinline__ negate(const negate&) {}
};
// Comparison Operations
template <typename T> struct equal_to : binary_function<T, T, bool>
{
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a,
typename TypeTraits<T>::ParameterType b) const
{
return a == b;
}
__host__ __device__ __forceinline__ equal_to() {}
__host__ __device__ __forceinline__ equal_to(const equal_to&) {}
};
template <typename T> struct not_equal_to : binary_function<T, T, bool>
{
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a,
typename TypeTraits<T>::ParameterType b) const
{
return a != b;
}
__host__ __device__ __forceinline__ not_equal_to() {}
__host__ __device__ __forceinline__ not_equal_to(const not_equal_to&) {}
};
template <typename T> struct greater : binary_function<T, T, bool>
{
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a,
typename TypeTraits<T>::ParameterType b) const
{
return a > b;
}
__host__ __device__ __forceinline__ greater() {}
__host__ __device__ __forceinline__ greater(const greater&) {}
};
template <typename T> struct less : binary_function<T, T, bool>
{
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a,
typename TypeTraits<T>::ParameterType b) const
{
return a < b;
}
__host__ __device__ __forceinline__ less() {}
__host__ __device__ __forceinline__ less(const less&) {}
};
template <typename T> struct greater_equal : binary_function<T, T, bool>
{
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a,
typename TypeTraits<T>::ParameterType b) const
{
return a >= b;
}
__host__ __device__ __forceinline__ greater_equal() {}
__host__ __device__ __forceinline__ greater_equal(const greater_equal&) {}
};
template <typename T> struct less_equal : binary_function<T, T, bool>
{
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a,
typename TypeTraits<T>::ParameterType b) const
{
return a <= b;
}
__host__ __device__ __forceinline__ less_equal() {}
__host__ __device__ __forceinline__ less_equal(const less_equal&) {}
};
// Logical Operations
template <typename T> struct logical_and : binary_function<T, T, bool>
{
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a,
typename TypeTraits<T>::ParameterType b) const
{
return a && b;
}
__host__ __device__ __forceinline__ logical_and() {}
__host__ __device__ __forceinline__ logical_and(const logical_and&) {}
};
template <typename T> struct logical_or : binary_function<T, T, bool>
{
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a,
typename TypeTraits<T>::ParameterType b) const
{
return a || b;
}
__host__ __device__ __forceinline__ logical_or() {}
__host__ __device__ __forceinline__ logical_or(const logical_or&) {}
};
template <typename T> struct logical_not : unary_function<T, bool>
{
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a) const
{
return !a;
}
__host__ __device__ __forceinline__ logical_not() {}
__host__ __device__ __forceinline__ logical_not(const logical_not&) {}
};
// Bitwise Operations
template <typename T> struct bit_and : binary_function<T, T, T>
{
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a,
typename TypeTraits<T>::ParameterType b) const
{
return a & b;
}
__host__ __device__ __forceinline__ bit_and() {}
__host__ __device__ __forceinline__ bit_and(const bit_and&) {}
};
template <typename T> struct bit_or : binary_function<T, T, T>
{
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a,
typename TypeTraits<T>::ParameterType b) const
{
return a | b;
}
__host__ __device__ __forceinline__ bit_or() {}
__host__ __device__ __forceinline__ bit_or(const bit_or&) {}
};
template <typename T> struct bit_xor : binary_function<T, T, T>
{
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a,
typename TypeTraits<T>::ParameterType b) const
{
return a ^ b;
}
__host__ __device__ __forceinline__ bit_xor() {}
__host__ __device__ __forceinline__ bit_xor(const bit_xor&) {}
};
template <typename T> struct bit_not : unary_function<T, T>
{
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType v) const
{
return ~v;
}
__host__ __device__ __forceinline__ bit_not() {}
__host__ __device__ __forceinline__ bit_not(const bit_not&) {}
};
// Generalized Identity Operations
template <typename T> struct identity : unary_function<T, T>
{
__device__ __forceinline__ typename TypeTraits<T>::ParameterType operator()(typename TypeTraits<T>::ParameterType x) const
{
return x;
}
__host__ __device__ __forceinline__ identity() {}
__host__ __device__ __forceinline__ identity(const identity&) {}
};
template <typename T1, typename T2> struct project1st : binary_function<T1, T2, T1>
{
__device__ __forceinline__ typename TypeTraits<T1>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const
{
return lhs;
}
__host__ __device__ __forceinline__ project1st() {}
__host__ __device__ __forceinline__ project1st(const project1st&) {}
};
template <typename T1, typename T2> struct project2nd : binary_function<T1, T2, T2>
{
__device__ __forceinline__ typename TypeTraits<T2>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const
{
return rhs;
}
__host__ __device__ __forceinline__ project2nd() {}
__host__ __device__ __forceinline__ project2nd(const project2nd&) {}
};
// Min/Max Operations
#define OPENCV_CUDA_IMPLEMENT_MINMAX(name, type, op) \
template <> struct name<type> : binary_function<type, type, type> \
{ \
__device__ __forceinline__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \
__host__ __device__ __forceinline__ name() {}\
__host__ __device__ __forceinline__ name(const name&) {}\
};
template <typename T> struct maximum : binary_function<T, T, T>
{
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
{
return max(lhs, rhs);
}
__host__ __device__ __forceinline__ maximum() {}
__host__ __device__ __forceinline__ maximum(const maximum&) {}
};
OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, uchar, ::max)
OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, schar, ::max)
OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, char, ::max)
OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, ushort, ::max)
OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, short, ::max)
OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, int, ::max)
OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, uint, ::max)
OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, float, ::fmax)
OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, double, ::fmax)
template <typename T> struct minimum : binary_function<T, T, T>
{
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
{
return min(lhs, rhs);
}
__host__ __device__ __forceinline__ minimum() {}
__host__ __device__ __forceinline__ minimum(const minimum&) {}
};
OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, uchar, ::min)
OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, schar, ::min)
OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, char, ::min)
OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, ushort, ::min)
OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, short, ::min)
OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, int, ::min)
OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, uint, ::min)
OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, float, ::fmin)
OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, double, ::fmin)
#undef OPENCV_CUDA_IMPLEMENT_MINMAX
// Math functions
template <typename T> struct abs_func : unary_function<T, T>
{
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType x) const
{
return abs(x);
}
__host__ __device__ __forceinline__ abs_func() {}
__host__ __device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<unsigned char> : unary_function<unsigned char, unsigned char>
{
__device__ __forceinline__ unsigned char operator ()(unsigned char x) const
{
return x;
}
__host__ __device__ __forceinline__ abs_func() {}
__host__ __device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<signed char> : unary_function<signed char, signed char>
{
__device__ __forceinline__ signed char operator ()(signed char x) const
{
return ::abs((int)x);
}
__host__ __device__ __forceinline__ abs_func() {}
__host__ __device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<char> : unary_function<char, char>
{
__device__ __forceinline__ char operator ()(char x) const
{
return ::abs((int)x);
}
__host__ __device__ __forceinline__ abs_func() {}
__host__ __device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<unsigned short> : unary_function<unsigned short, unsigned short>
{
__device__ __forceinline__ unsigned short operator ()(unsigned short x) const
{
return x;
}
__host__ __device__ __forceinline__ abs_func() {}
__host__ __device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<short> : unary_function<short, short>
{
__device__ __forceinline__ short operator ()(short x) const
{
return ::abs((int)x);
}
__host__ __device__ __forceinline__ abs_func() {}
__host__ __device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<unsigned int> : unary_function<unsigned int, unsigned int>
{
__device__ __forceinline__ unsigned int operator ()(unsigned int x) const
{
return x;
}
__host__ __device__ __forceinline__ abs_func() {}
__host__ __device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<int> : unary_function<int, int>
{
__device__ __forceinline__ int operator ()(int x) const
{
return ::abs(x);
}
__host__ __device__ __forceinline__ abs_func() {}
__host__ __device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<float> : unary_function<float, float>
{
__device__ __forceinline__ float operator ()(float x) const
{
return ::fabsf(x);
}
__host__ __device__ __forceinline__ abs_func() {}
__host__ __device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<double> : unary_function<double, double>
{
__device__ __forceinline__ double operator ()(double x) const
{
return ::fabs(x);
}
__host__ __device__ __forceinline__ abs_func() {}
__host__ __device__ __forceinline__ abs_func(const abs_func&) {}
};
#define OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(name, func) \
template <typename T> struct name ## _func : unary_function<T, float> \
{ \
__device__ __forceinline__ float operator ()(typename TypeTraits<T>::ParameterType v) const \
{ \
return func ## f(v); \
} \
__host__ __device__ __forceinline__ name ## _func() {} \
__host__ __device__ __forceinline__ name ## _func(const name ## _func&) {} \
}; \
template <> struct name ## _func<double> : unary_function<double, double> \
{ \
__device__ __forceinline__ double operator ()(double v) const \
{ \
return func(v); \
} \
__host__ __device__ __forceinline__ name ## _func() {} \
__host__ __device__ __forceinline__ name ## _func(const name ## _func&) {} \
};
#define OPENCV_CUDA_IMPLEMENT_BIN_FUNCTOR(name, func) \
template <typename T> struct name ## _func : binary_function<T, T, float> \
{ \
__device__ __forceinline__ float operator ()(typename TypeTraits<T>::ParameterType v1, typename TypeTraits<T>::ParameterType v2) const \
{ \
return func ## f(v1, v2); \
} \
__host__ __device__ __forceinline__ name ## _func() {} \
__host__ __device__ __forceinline__ name ## _func(const name ## _func&) {} \
}; \
template <> struct name ## _func<double> : binary_function<double, double, double> \
{ \
__device__ __forceinline__ double operator ()(double v1, double v2) const \
{ \
return func(v1, v2); \
} \
__host__ __device__ __forceinline__ name ## _func() {} \
__host__ __device__ __forceinline__ name ## _func(const name ## _func&) {} \
};
OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(sqrt, ::sqrt)
OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(exp, ::exp)
OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(exp2, ::exp2)
OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(exp10, ::exp10)
OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(log, ::log)
OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(log2, ::log2)
OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(log10, ::log10)
OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(sin, ::sin)
OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(cos, ::cos)
OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(tan, ::tan)
OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(asin, ::asin)
OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(acos, ::acos)
OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(atan, ::atan)
OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(sinh, ::sinh)
OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(cosh, ::cosh)
OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(tanh, ::tanh)
OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(asinh, ::asinh)
OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(acosh, ::acosh)
OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(atanh, ::atanh)
OPENCV_CUDA_IMPLEMENT_BIN_FUNCTOR(hypot, ::hypot)
OPENCV_CUDA_IMPLEMENT_BIN_FUNCTOR(atan2, ::atan2)
OPENCV_CUDA_IMPLEMENT_BIN_FUNCTOR(pow, ::pow)
#undef OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR
#undef OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR_NO_DOUBLE
#undef OPENCV_CUDA_IMPLEMENT_BIN_FUNCTOR
template<typename T> struct hypot_sqr_func : binary_function<T, T, float>
{
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType src1, typename TypeTraits<T>::ParameterType src2) const
{
return src1 * src1 + src2 * src2;
}
__host__ __device__ __forceinline__ hypot_sqr_func() {}
__host__ __device__ __forceinline__ hypot_sqr_func(const hypot_sqr_func&) {}
};
// Saturate Cast Functor
template <typename T, typename D> struct saturate_cast_func : unary_function<T, D>
{
__device__ __forceinline__ D operator ()(typename TypeTraits<T>::ParameterType v) const
{
return saturate_cast<D>(v);
}
__host__ __device__ __forceinline__ saturate_cast_func() {}
__host__ __device__ __forceinline__ saturate_cast_func(const saturate_cast_func&) {}
};
// Threshold Functors
template <typename T> struct thresh_binary_func : unary_function<T, T>
{
__host__ __device__ __forceinline__ thresh_binary_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
{
return (src > thresh) * maxVal;
}
__host__ __device__ __forceinline__ thresh_binary_func() {}
__host__ __device__ __forceinline__ thresh_binary_func(const thresh_binary_func& other)
: thresh(other.thresh), maxVal(other.maxVal) {}
T thresh;
T maxVal;
};
template <typename T> struct thresh_binary_inv_func : unary_function<T, T>
{
__host__ __device__ __forceinline__ thresh_binary_inv_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
{
return (src <= thresh) * maxVal;
}
__host__ __device__ __forceinline__ thresh_binary_inv_func() {}
__host__ __device__ __forceinline__ thresh_binary_inv_func(const thresh_binary_inv_func& other)
: thresh(other.thresh), maxVal(other.maxVal) {}
T thresh;
T maxVal;
};
template <typename T> struct thresh_trunc_func : unary_function<T, T>
{
explicit __host__ __device__ __forceinline__ thresh_trunc_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {CV_UNUSED(maxVal_);}
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
{
return minimum<T>()(src, thresh);
}
__host__ __device__ __forceinline__ thresh_trunc_func() {}
__host__ __device__ __forceinline__ thresh_trunc_func(const thresh_trunc_func& other)
: thresh(other.thresh) {}
T thresh;
};
template <typename T> struct thresh_to_zero_func : unary_function<T, T>
{
explicit __host__ __device__ __forceinline__ thresh_to_zero_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {CV_UNUSED(maxVal_);}
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
{
return (src > thresh) * src;
}
__host__ __device__ __forceinline__ thresh_to_zero_func() {}
__host__ __device__ __forceinline__ thresh_to_zero_func(const thresh_to_zero_func& other)
: thresh(other.thresh) {}
T thresh;
};
template <typename T> struct thresh_to_zero_inv_func : unary_function<T, T>
{
explicit __host__ __device__ __forceinline__ thresh_to_zero_inv_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {CV_UNUSED(maxVal_);}
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
{
return (src <= thresh) * src;
}
__host__ __device__ __forceinline__ thresh_to_zero_inv_func() {}
__host__ __device__ __forceinline__ thresh_to_zero_inv_func(const thresh_to_zero_inv_func& other)
: thresh(other.thresh) {}
T thresh;
};
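    // Note: these five functors implement the same per-pixel rules as
    // cv::threshold's THRESH_BINARY, THRESH_BINARY_INV, THRESH_TRUNC,
    // THRESH_TOZERO and THRESH_TOZERO_INV modes. The branchless form
    // (src > thresh) * maxVal exploits the comparison evaluating to exactly
    // 0 or 1, keeping the kernels free of divergent branches.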
// Function Object Adaptors
template <typename Predicate> struct unary_negate : unary_function<typename Predicate::argument_type, bool>
{
explicit __host__ __device__ __forceinline__ unary_negate(const Predicate& p) : pred(p) {}
__device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::argument_type>::ParameterType x) const
{
return !pred(x);
}
__host__ __device__ __forceinline__ unary_negate() {}
__host__ __device__ __forceinline__ unary_negate(const unary_negate& other) : pred(other.pred) {}
Predicate pred;
};
template <typename Predicate> __host__ __device__ __forceinline__ unary_negate<Predicate> not1(const Predicate& pred)
{
return unary_negate<Predicate>(pred);
}
template <typename Predicate> struct binary_negate : binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>
{
explicit __host__ __device__ __forceinline__ binary_negate(const Predicate& p) : pred(p) {}
__device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::first_argument_type>::ParameterType x,
typename TypeTraits<typename Predicate::second_argument_type>::ParameterType y) const
{
return !pred(x,y);
}
__host__ __device__ __forceinline__ binary_negate() {}
__host__ __device__ __forceinline__ binary_negate(const binary_negate& other) : pred(other.pred) {}
Predicate pred;
};
template <typename BinaryPredicate> __host__ __device__ __forceinline__ binary_negate<BinaryPredicate> not2(const BinaryPredicate& pred)
{
return binary_negate<BinaryPredicate>(pred);
}
template <typename Op> struct binder1st : unary_function<typename Op::second_argument_type, typename Op::result_type>
{
__host__ __device__ __forceinline__ binder1st(const Op& op_, const typename Op::first_argument_type& arg1_) : op(op_), arg1(arg1_) {}
__device__ __forceinline__ typename Op::result_type operator ()(typename TypeTraits<typename Op::second_argument_type>::ParameterType a) const
{
return op(arg1, a);
}
__host__ __device__ __forceinline__ binder1st() {}
__host__ __device__ __forceinline__ binder1st(const binder1st& other) : op(other.op), arg1(other.arg1) {}
Op op;
typename Op::first_argument_type arg1;
};
template <typename Op, typename T> __host__ __device__ __forceinline__ binder1st<Op> bind1st(const Op& op, const T& x)
{
return binder1st<Op>(op, typename Op::first_argument_type(x));
}
template <typename Op> struct binder2nd : unary_function<typename Op::first_argument_type, typename Op::result_type>
{
__host__ __device__ __forceinline__ binder2nd(const Op& op_, const typename Op::second_argument_type& arg2_) : op(op_), arg2(arg2_) {}
__forceinline__ __device__ typename Op::result_type operator ()(typename TypeTraits<typename Op::first_argument_type>::ParameterType a) const
{
return op(a, arg2);
}
__host__ __device__ __forceinline__ binder2nd() {}
__host__ __device__ __forceinline__ binder2nd(const binder2nd& other) : op(other.op), arg2(other.arg2) {}
Op op;
typename Op::second_argument_type arg2;
};
template <typename Op, typename T> __host__ __device__ __forceinline__ binder2nd<Op> bind2nd(const Op& op, const T& x)
{
return binder2nd<Op>(op, typename Op::second_argument_type(x));
}
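    // Example: bind2nd(minimum<float>(), 255.0f) fixes the second argument,
    // yielding a unary functor that clamps each input to at most 255;
    // bind1st fixes the first argument instead.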
// Functor Traits
template <typename F> struct IsUnaryFunction
{
typedef char Yes;
struct No {Yes a[2];};
template <typename T, typename D> static Yes check(unary_function<T, D>);
static No check(...);
static F makeF();
enum { value = (sizeof(check(makeF())) == sizeof(Yes)) };
};
template <typename F> struct IsBinaryFunction
{
typedef char Yes;
struct No {Yes a[2];};
template <typename T1, typename T2, typename D> static Yes check(binary_function<T1, T2, D>);
static No check(...);
static F makeF();
enum { value = (sizeof(check(makeF())) == sizeof(Yes)) };
};
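    // Note: IsUnaryFunction/IsBinaryFunction use the classic pre-C++11
    // sizeof-based detection idiom: if F derives from unary_function /
    // binary_function, overload resolution on check(makeF()) picks the
    // template overload returning Yes; otherwise the ellipsis overload wins.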
namespace functional_detail
{
template <size_t src_elem_size, size_t dst_elem_size> struct UnOpShift { enum { shift = 1 }; };
template <size_t src_elem_size> struct UnOpShift<src_elem_size, 1> { enum { shift = 4 }; };
template <size_t src_elem_size> struct UnOpShift<src_elem_size, 2> { enum { shift = 2 }; };
template <typename T, typename D> struct DefaultUnaryShift
{
enum { shift = UnOpShift<sizeof(T), sizeof(D)>::shift };
};
template <size_t src_elem_size1, size_t src_elem_size2, size_t dst_elem_size> struct BinOpShift { enum { shift = 1 }; };
template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 1> { enum { shift = 4 }; };
template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 2> { enum { shift = 2 }; };
template <typename T1, typename T2, typename D> struct DefaultBinaryShift
{
enum { shift = BinOpShift<sizeof(T1), sizeof(T2), sizeof(D)>::shift };
};
template <typename Func, bool unary = IsUnaryFunction<Func>::value> struct ShiftDispatcher;
template <typename Func> struct ShiftDispatcher<Func, true>
{
enum { shift = DefaultUnaryShift<typename Func::argument_type, typename Func::result_type>::shift };
};
template <typename Func> struct ShiftDispatcher<Func, false>
{
enum { shift = DefaultBinaryShift<typename Func::first_argument_type, typename Func::second_argument_type, typename Func::result_type>::shift };
};
}
template <typename Func> struct DefaultTransformShift
{
enum { shift = functional_detail::ShiftDispatcher<Func>::shift };
};
template <typename Func> struct DefaultTransformFunctorTraits
{
enum { simple_block_dim_x = 16 };
enum { simple_block_dim_y = 16 };
enum { smart_block_dim_x = 16 };
enum { smart_block_dim_y = 16 };
enum { smart_shift = DefaultTransformShift<Func>::shift };
};
template <typename Func> struct TransformFunctorTraits : DefaultTransformFunctorTraits<Func> {};
#define OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(type) \
template <> struct TransformFunctorTraits< type > : DefaultTransformFunctorTraits< type >
}}} // namespace cv { namespace cuda { namespace device
//! @endcond
#endif // OPENCV_CUDA_FUNCTIONAL_HPP
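The traits machinery above picks how many elements one CUDA thread should process (the "smart shift") purely from operand sizes: 4x for 1-byte results, 2x for 2-byte results, 1x otherwise. A minimal host-side sketch of that rule, restated standalone (the UnOpShiftSketch name is illustrative only) so the dispatch can be checked with static_assert under C++17:

#include <cstddef>

// Standalone restatement of UnOpShift: 1-byte outputs get 4 elements per
// thread, 2-byte outputs get 2, everything else gets 1.
template <std::size_t SrcSize, std::size_t DstSize>
struct UnOpShiftSketch { enum { shift = 1 }; };
template <std::size_t SrcSize>
struct UnOpShiftSketch<SrcSize, 1> { enum { shift = 4 }; };
template <std::size_t SrcSize>
struct UnOpShiftSketch<SrcSize, 2> { enum { shift = 2 }; };

static_assert(UnOpShiftSketch<sizeof(float), sizeof(unsigned char)>::shift == 4, "1-byte result: 4x");
static_assert(UnOpShiftSketch<sizeof(float), sizeof(short)>::shift == 2, "2-byte result: 2x");
static_assert(UnOpShiftSketch<sizeof(float), sizeof(float)>::shift == 1, "4-byte result: 1x");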

View File

@@ -0,0 +1,128 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_CUDA_LIMITS_HPP
#define OPENCV_CUDA_LIMITS_HPP
#include <limits.h>
#include <float.h>
#include "common.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
template <class T> struct numeric_limits;
template <> struct numeric_limits<bool>
{
__device__ __forceinline__ static bool min() { return false; }
__device__ __forceinline__ static bool max() { return true; }
static const bool is_signed = false;
};
template <> struct numeric_limits<signed char>
{
__device__ __forceinline__ static signed char min() { return SCHAR_MIN; }
__device__ __forceinline__ static signed char max() { return SCHAR_MAX; }
static const bool is_signed = true;
};
template <> struct numeric_limits<unsigned char>
{
__device__ __forceinline__ static unsigned char min() { return 0; }
__device__ __forceinline__ static unsigned char max() { return UCHAR_MAX; }
static const bool is_signed = false;
};
template <> struct numeric_limits<short>
{
__device__ __forceinline__ static short min() { return SHRT_MIN; }
__device__ __forceinline__ static short max() { return SHRT_MAX; }
static const bool is_signed = true;
};
template <> struct numeric_limits<unsigned short>
{
__device__ __forceinline__ static unsigned short min() { return 0; }
__device__ __forceinline__ static unsigned short max() { return USHRT_MAX; }
static const bool is_signed = false;
};
template <> struct numeric_limits<int>
{
__device__ __forceinline__ static int min() { return INT_MIN; }
__device__ __forceinline__ static int max() { return INT_MAX; }
static const bool is_signed = true;
};
template <> struct numeric_limits<unsigned int>
{
__device__ __forceinline__ static unsigned int min() { return 0; }
__device__ __forceinline__ static unsigned int max() { return UINT_MAX; }
static const bool is_signed = false;
};
template <> struct numeric_limits<float>
{
__device__ __forceinline__ static float min() { return FLT_MIN; }
__device__ __forceinline__ static float max() { return FLT_MAX; }
__device__ __forceinline__ static float epsilon() { return FLT_EPSILON; }
static const bool is_signed = true;
};
template <> struct numeric_limits<double>
{
__device__ __forceinline__ static double min() { return DBL_MIN; }
__device__ __forceinline__ static double max() { return DBL_MAX; }
__device__ __forceinline__ static double epsilon() { return DBL_EPSILON; }
static const bool is_signed = true;
};
}}} // namespace cv { namespace cuda { namespace device
//! @endcond
#endif // OPENCV_CUDA_LIMITS_HPP
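Unlike std::numeric_limits, these specializations are callable from device code, which is exactly what reduction kernels need to seed their accumulators. A minimal sketch with a hypothetical kernel name and a grid-stride loop:

#include "opencv2/core/cuda/limits.hpp"

// Hypothetical kernel: each thread takes the minimum over a strided slice,
// seeding its accumulator with the device-callable max() of the type.
template <typename T>
__global__ void sliceMin(const T* data, int n, T* out)
{
    T best = cv::cuda::device::numeric_limits<T>::max();
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x)
        if (data[i] < best) best = data[i];
    out[blockIdx.x * blockDim.x + threadIdx.x] = best;
}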

View File

@@ -0,0 +1,209 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_CUDA_REDUCE_HPP
#define OPENCV_CUDA_REDUCE_HPP
#ifndef THRUST_DEBUG // eliminate -Wundef warning
#define THRUST_DEBUG 0
#endif
#include <thrust/tuple.h>
#include "detail/reduce.hpp"
#include "detail/reduce_key_val.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
template <int N, typename T, class Op>
__device__ __forceinline__ void reduce(volatile T* smem, T& val, unsigned int tid, const Op& op)
{
reduce_detail::Dispatcher<N>::reductor::template reduce<volatile T*, T&, const Op&>(smem, val, tid, op);
}
template <int N,
typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
__device__ __forceinline__ void reduce(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
unsigned int tid,
const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
{
reduce_detail::Dispatcher<N>::reductor::template reduce<
const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>&,
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>&,
const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>&>(smem, val, tid, op);
}
template <unsigned int N, typename K, typename V, class Cmp>
__device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, volatile V* svals, V& val, unsigned int tid, const Cmp& cmp)
{
reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<volatile K*, K&, volatile V*, V&, const Cmp&>(skeys, key, svals, val, tid, cmp);
}
template <unsigned int N,
typename K,
typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
class Cmp>
__device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key,
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
unsigned int tid, const Cmp& cmp)
{
reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<volatile K*, K&,
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>&,
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>&,
const Cmp&>(skeys, key, svals, val, tid, cmp);
}
template <unsigned int N,
typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
__device__ __forceinline__ void reduceKeyVal(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
unsigned int tid,
const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp)
{
reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<
const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>&,
const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>&,
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>&,
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>&,
const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>&
>(skeys, key, svals, val, tid, cmp);
}
// smem_tuple
template <typename T0>
__device__ __forceinline__
thrust::tuple<volatile T0*>
smem_tuple(T0* t0)
{
return thrust::make_tuple((volatile T0*) t0);
}
template <typename T0, typename T1>
__device__ __forceinline__
thrust::tuple<volatile T0*, volatile T1*>
smem_tuple(T0* t0, T1* t1)
{
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1);
}
template <typename T0, typename T1, typename T2>
__device__ __forceinline__
thrust::tuple<volatile T0*, volatile T1*, volatile T2*>
smem_tuple(T0* t0, T1* t1, T2* t2)
{
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2);
}
template <typename T0, typename T1, typename T2, typename T3>
__device__ __forceinline__
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*>
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3)
{
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3);
}
template <typename T0, typename T1, typename T2, typename T3, typename T4>
__device__ __forceinline__
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*>
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4)
{
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4);
}
template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5>
__device__ __forceinline__
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*>
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5)
{
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5);
}
template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6>
__device__ __forceinline__
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*>
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6)
{
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6);
}
template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7>
__device__ __forceinline__
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*>
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7)
{
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7);
}
template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8>
__device__ __forceinline__
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*, volatile T8*>
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8)
{
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8);
}
template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9>
__device__ __forceinline__
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*, volatile T8*, volatile T9*>
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8, T9* t9)
{
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8, (volatile T9*) t9);
}
}}}
//! @endcond
#endif // OPENCV_CUDA_REDUCE_HPP
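The tuple overloads above serve multi-value reductions (e.g. a minimum together with its location); the common case is a single value reduced across one block through shared memory. A minimal sketch, assuming a 256-thread block and the plus<> functor from functional.hpp (the kernel name is illustrative):

#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"

// Hypothetical block-sum kernel: 256 threads cooperatively reduce one
// 256-element chunk; thread 0 writes the partial sum for its block.
__global__ void blockSum256(const float* in, float* partial)
{
    __shared__ float smem[256];
    const unsigned int tid = threadIdx.x;
    float val = in[blockIdx.x * 256 + tid];
    cv::cuda::device::reduce<256>(smem, val, tid, cv::cuda::device::plus<float>());
    if (tid == 0)
        partial[blockIdx.x] = val;
}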

View File

@@ -0,0 +1,292 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_CUDA_SATURATE_CAST_HPP
#define OPENCV_CUDA_SATURATE_CAST_HPP
#include "common.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uchar v) { return _Tp(v); }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(schar v) { return _Tp(v); }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(ushort v) { return _Tp(v); }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(short v) { return _Tp(v); }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uint v) { return _Tp(v); }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(int v) { return _Tp(v); }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(float v) { return _Tp(v); }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); }
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)
{
uint res = 0;
int vi = v;
asm("cvt.sat.u8.s8 %0, %1;" : "=r"(res) : "r"(vi));
return res;
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
{
uint res = 0;
asm("cvt.sat.u8.s16 %0, %1;" : "=r"(res) : "h"(v));
return res;
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
{
uint res = 0;
asm("cvt.sat.u8.u16 %0, %1;" : "=r"(res) : "h"(v));
return res;
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
{
uint res = 0;
asm("cvt.sat.u8.s32 %0, %1;" : "=r"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
{
uint res = 0;
asm("cvt.sat.u8.u32 %0, %1;" : "=r"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v)
{
uint res = 0;
asm("cvt.rni.sat.u8.f32 %0, %1;" : "=r"(res) : "f"(v));
return res;
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
uint res = 0;
asm("cvt.rni.sat.u8.f64 %0, %1;" : "=r"(res) : "d"(v));
return res;
#else
return saturate_cast<uchar>((float)v);
#endif
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v)
{
uint res = 0;
uint vi = v;
asm("cvt.sat.s8.u8 %0, %1;" : "=r"(res) : "r"(vi));
return res;
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)
{
uint res = 0;
asm("cvt.sat.s8.s16 %0, %1;" : "=r"(res) : "h"(v));
return res;
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
{
uint res = 0;
asm("cvt.sat.s8.u16 %0, %1;" : "=r"(res) : "h"(v));
return res;
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
{
uint res = 0;
asm("cvt.sat.s8.s32 %0, %1;" : "=r"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v)
{
uint res = 0;
asm("cvt.sat.s8.u32 %0, %1;" : "=r"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(float v)
{
uint res = 0;
asm("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(res) : "f"(v));
return res;
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(double v)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
uint res = 0;
asm("cvt.rni.sat.s8.f64 %0, %1;" : "=r"(res) : "d"(v));
return res;
#else
return saturate_cast<schar>((float)v);
#endif
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)
{
ushort res = 0;
int vi = v;
asm("cvt.sat.u16.s8 %0, %1;" : "=h"(res) : "r"(vi));
return res;
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v)
{
ushort res = 0;
asm("cvt.sat.u16.s16 %0, %1;" : "=h"(res) : "h"(v));
return res;
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v)
{
ushort res = 0;
asm("cvt.sat.u16.s32 %0, %1;" : "=h"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)
{
ushort res = 0;
asm("cvt.sat.u16.u32 %0, %1;" : "=h"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v)
{
ushort res = 0;
asm("cvt.rni.sat.u16.f32 %0, %1;" : "=h"(res) : "f"(v));
return res;
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
ushort res = 0;
asm("cvt.rni.sat.u16.f64 %0, %1;" : "=h"(res) : "d"(v));
return res;
#else
return saturate_cast<ushort>((float)v);
#endif
}
template<> __device__ __forceinline__ short saturate_cast<short>(ushort v)
{
short res = 0;
asm("cvt.sat.s16.u16 %0, %1;" : "=h"(res) : "h"(v));
return res;
}
template<> __device__ __forceinline__ short saturate_cast<short>(int v)
{
short res = 0;
asm("cvt.sat.s16.s32 %0, %1;" : "=h"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ short saturate_cast<short>(uint v)
{
short res = 0;
asm("cvt.sat.s16.u32 %0, %1;" : "=h"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ short saturate_cast<short>(float v)
{
short res = 0;
asm("cvt.rni.sat.s16.f32 %0, %1;" : "=h"(res) : "f"(v));
return res;
}
template<> __device__ __forceinline__ short saturate_cast<short>(double v)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
short res = 0;
asm("cvt.rni.sat.s16.f64 %0, %1;" : "=h"(res) : "d"(v));
return res;
#else
return saturate_cast<short>((float)v);
#endif
}
template<> __device__ __forceinline__ int saturate_cast<int>(uint v)
{
int res = 0;
asm("cvt.sat.s32.u32 %0, %1;" : "=r"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ int saturate_cast<int>(float v)
{
return __float2int_rn(v);
}
template<> __device__ __forceinline__ int saturate_cast<int>(double v)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
return __double2int_rn(v);
#else
return saturate_cast<int>((float)v);
#endif
}
template<> __device__ __forceinline__ uint saturate_cast<uint>(schar v)
{
uint res = 0;
int vi = v;
asm("cvt.sat.u32.s8 %0, %1;" : "=r"(res) : "r"(vi));
return res;
}
template<> __device__ __forceinline__ uint saturate_cast<uint>(short v)
{
uint res = 0;
asm("cvt.sat.u32.s16 %0, %1;" : "=r"(res) : "h"(v));
return res;
}
template<> __device__ __forceinline__ uint saturate_cast<uint>(int v)
{
uint res = 0;
asm("cvt.sat.u32.s32 %0, %1;" : "=r"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ uint saturate_cast<uint>(float v)
{
return __float2uint_rn(v);
}
template<> __device__ __forceinline__ uint saturate_cast<uint>(double v)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
return __double2uint_rn(v);
#else
return saturate_cast<uint>((float)v);
#endif
}
}}}
//! @endcond
#endif /* OPENCV_CUDA_SATURATE_CAST_HPP */
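Each specialization compiles to a single saturating cvt instruction instead of a compare-and-select pair, and the .rni variants round floating-point inputs to the nearest integer before clamping. A minimal sketch of the typical use, a hypothetical float-to-8-bit conversion kernel:

#include "opencv2/core/cuda/saturate_cast.hpp"

// Hypothetical kernel: scaled floats are rounded and clamped into [0, 255]
// by one saturating conversion per element.
__global__ void toU8(const float* src, unsigned char* dst, int n, float scale)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = cv::cuda::device::saturate_cast<unsigned char>(src[i] * scale);
}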

View File

@@ -0,0 +1,258 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_CUDA_SCAN_HPP
#define OPENCV_CUDA_SCAN_HPP
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include "opencv2/core/cuda/warp.hpp"
#include "opencv2/core/cuda/warp_shuffle.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
enum ScanKind { EXCLUSIVE = 0, INCLUSIVE = 1 };
template <ScanKind Kind, typename T, typename F> struct WarpScan
{
__device__ __forceinline__ WarpScan() {}
__device__ __forceinline__ WarpScan(const WarpScan& other) { CV_UNUSED(other); }
__device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)
{
const unsigned int lane = idx & 31;
F op;
if ( lane >= 1) ptr [idx ] = op(ptr [idx - 1], ptr [idx]);
if ( lane >= 2) ptr [idx ] = op(ptr [idx - 2], ptr [idx]);
if ( lane >= 4) ptr [idx ] = op(ptr [idx - 4], ptr [idx]);
if ( lane >= 8) ptr [idx ] = op(ptr [idx - 8], ptr [idx]);
if ( lane >= 16) ptr [idx ] = op(ptr [idx - 16], ptr [idx]);
if( Kind == INCLUSIVE )
return ptr [idx];
else
return (lane > 0) ? ptr [idx - 1] : 0;
}
__device__ __forceinline__ unsigned int index(const unsigned int tid)
{
return tid;
}
__device__ __forceinline__ void init(volatile T *ptr){}
static const int warp_offset = 0;
typedef WarpScan<INCLUSIVE, T, F> merge;
};
template <ScanKind Kind , typename T, typename F> struct WarpScanNoComp
{
__device__ __forceinline__ WarpScanNoComp() {}
__device__ __forceinline__ WarpScanNoComp(const WarpScanNoComp& other) { CV_UNUSED(other); }
__device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)
{
const unsigned int lane = threadIdx.x & 31;
F op;
ptr [idx ] = op(ptr [idx - 1], ptr [idx]);
ptr [idx ] = op(ptr [idx - 2], ptr [idx]);
ptr [idx ] = op(ptr [idx - 4], ptr [idx]);
ptr [idx ] = op(ptr [idx - 8], ptr [idx]);
ptr [idx ] = op(ptr [idx - 16], ptr [idx]);
if( Kind == INCLUSIVE )
return ptr [idx];
else
return (lane > 0) ? ptr [idx - 1] : 0;
}
__device__ __forceinline__ unsigned int index(const unsigned int tid)
{
return (tid >> warp_log) * warp_smem_stride + 16 + (tid & warp_mask);
}
__device__ __forceinline__ void init(volatile T *ptr)
{
ptr[threadIdx.x] = 0;
}
static const int warp_smem_stride = 32 + 16 + 1;
static const int warp_offset = 16;
static const int warp_log = 5;
static const int warp_mask = 31;
typedef WarpScanNoComp<INCLUSIVE, T, F> merge;
};
template <ScanKind Kind , typename T, typename Sc, typename F> struct BlockScan
{
__device__ __forceinline__ BlockScan() {}
__device__ __forceinline__ BlockScan(const BlockScan& other) { CV_UNUSED(other); }
__device__ __forceinline__ T operator()(volatile T *ptr)
{
const unsigned int tid = threadIdx.x;
const unsigned int lane = tid & warp_mask;
const unsigned int warp = tid >> warp_log;
Sc scan;
typename Sc::merge merge_scan;
const unsigned int idx = scan.index(tid);
T val = scan(ptr, idx);
__syncthreads ();
if( warp == 0)
scan.init(ptr);
__syncthreads ();
if( lane == 31 )
ptr [scan.warp_offset + warp ] = (Kind == INCLUSIVE) ? val : ptr [idx];
__syncthreads ();
if( warp == 0 )
merge_scan(ptr, idx);
__syncthreads();
if ( warp > 0)
val = ptr [scan.warp_offset + warp - 1] + val;
__syncthreads ();
ptr[idx] = val;
__syncthreads ();
return val ;
}
static const int warp_log = 5;
static const int warp_mask = 31;
};
template <typename T>
__device__ T warpScanInclusive(T idata, volatile T* s_Data, unsigned int tid)
{
#if __CUDA_ARCH__ >= 300
const unsigned int laneId = cv::cuda::device::Warp::laneId();
// scan using warp shuffle functions
#pragma unroll
for (int i = 1; i <= (OPENCV_CUDA_WARP_SIZE / 2); i *= 2)
{
const T n = cv::cuda::device::shfl_up(idata, i);
if (laneId >= i)
idata += n;
}
return idata;
#else
unsigned int pos = 2 * tid - (tid & (OPENCV_CUDA_WARP_SIZE - 1));
s_Data[pos] = 0;
pos += OPENCV_CUDA_WARP_SIZE;
s_Data[pos] = idata;
s_Data[pos] += s_Data[pos - 1];
s_Data[pos] += s_Data[pos - 2];
s_Data[pos] += s_Data[pos - 4];
s_Data[pos] += s_Data[pos - 8];
s_Data[pos] += s_Data[pos - 16];
return s_Data[pos];
#endif
}
template <typename T>
__device__ __forceinline__ T warpScanExclusive(T idata, volatile T* s_Data, unsigned int tid)
{
return warpScanInclusive(idata, s_Data, tid) - idata;
}
template <int tiNumScanThreads, typename T>
__device__ T blockScanInclusive(T idata, volatile T* s_Data, unsigned int tid)
{
if (tiNumScanThreads > OPENCV_CUDA_WARP_SIZE)
{
//Bottom-level inclusive warp scan
T warpResult = warpScanInclusive(idata, s_Data, tid);
//Save top elements of each warp for exclusive warp scan
//sync to wait for warp scans to complete (because s_Data is being overwritten)
__syncthreads();
if ((tid & (OPENCV_CUDA_WARP_SIZE - 1)) == (OPENCV_CUDA_WARP_SIZE - 1))
{
s_Data[tid >> OPENCV_CUDA_LOG_WARP_SIZE] = warpResult;
}
//wait for warp scans to complete
__syncthreads();
if (tid < (tiNumScanThreads / OPENCV_CUDA_WARP_SIZE) )
{
//grab top warp elements
T val = s_Data[tid];
//calculate exclusive scan and write back to shared memory
s_Data[tid] = warpScanExclusive(val, s_Data, tid);
}
//return updated warp scans with exclusive scan results
__syncthreads();
return warpResult + s_Data[tid >> OPENCV_CUDA_LOG_WARP_SIZE];
}
else
{
return warpScanInclusive(idata, s_Data, tid);
}
}
}}}
//! @endcond
#endif // OPENCV_CUDA_SCAN_HPP
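blockScanInclusive layers a scan of per-warp totals on top of per-warp scans. Note the shared buffer has to cover the pre-Kepler fallback path, which indexes up to twice the thread count. A minimal sketch assuming 256 threads per block (the kernel name is illustrative):

#include "opencv2/core/cuda/scan.hpp"

// Hypothetical kernel: each 256-thread block computes an inclusive prefix
// sum of its chunk. The buffer is sized for the shared-memory fallback,
// which addresses up to 2 * blockDim.x entries.
__global__ void blockPrefixSum(const float* in, float* out)
{
    __shared__ float s_Data[2 * 256];
    const unsigned int tid = threadIdx.x;
    const float v = in[blockIdx.x * 256 + tid];
    out[blockIdx.x * 256 + tid] = cv::cuda::device::blockScanInclusive<256>(v, s_Data, tid);
}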

View File

@@ -0,0 +1,869 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/*
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* Neither the name of NVIDIA Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef OPENCV_CUDA_SIMD_FUNCTIONS_HPP
#define OPENCV_CUDA_SIMD_FUNCTIONS_HPP
#include "common.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
// 2
static __device__ __forceinline__ unsigned int vadd2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int s;
s = a ^ b; // sum bits
r = a + b; // actual sum
s = s ^ r; // determine carry-ins for each bit position
s = s & 0x00010000; // carry-in to high word (= carry-out from low word)
r = r - s; // subtract out carry-out from low word
#endif
return r;
}
static __device__ __forceinline__ unsigned int vsub2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int s;
        s = a ^ b;          // difference bits
        r = a - b;          // actual difference
        s = s ^ r;          // determine borrow-ins for each bit position
s = s & 0x00010000; // borrow to high word
r = r + s; // compensate for borrow from low word
#endif
return r;
}
static __device__ __forceinline__ unsigned int vabsdiff2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int s, t, u, v;
s = a & 0x0000ffff; // extract low halfword
r = b & 0x0000ffff; // extract low halfword
u = ::max(r, s); // maximum of low halfwords
v = ::min(r, s); // minimum of low halfwords
s = a & 0xffff0000; // extract high halfword
r = b & 0xffff0000; // extract high halfword
t = ::max(r, s); // maximum of high halfwords
s = ::min(r, s); // minimum of high halfwords
r = u | t; // maximum of both halfwords
s = v | s; // minimum of both halfwords
r = r - s; // |a - b| = max(a,b) - min(a,b);
#endif
return r;
}
static __device__ __forceinline__ unsigned int vavg2(unsigned int a, unsigned int b)
{
unsigned int r, s;
// HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
// (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
s = a ^ b;
r = a & b;
s = s & 0xfffefffe; // ensure shift doesn't cross halfword boundaries
s = s >> 1;
s = r + s;
return s;
}
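    // Worked example, per 16-bit lane with a = 5, b = 3: a ^ b = 6,
    // a & b = 1, (a ^ b) >> 1 = 3, result = 1 + 3 = 4 = (5 + 3) / 2.
    // The 0xfffefffe mask clears the lowest bit of each halfword before
    // the shift, so no bit from the high lane leaks into the low lane.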
static __device__ __forceinline__ unsigned int vavrg2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vavrg2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
// HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
// (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
unsigned int s;
s = a ^ b;
r = a | b;
s = s & 0xfffefffe; // ensure shift doesn't cross half-word boundaries
s = s >> 1;
r = r - s;
#endif
return r;
}
static __device__ __forceinline__ unsigned int vseteq2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset2.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
// inspired by Alan Mycroft's null-byte detection algorithm:
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
unsigned int c;
r = a ^ b; // 0x0000 if a == b
c = r | 0x80008000; // set msbs, to catch carry out
r = r ^ c; // extract msbs, msb = 1 if r < 0x8000
c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
c = r & ~c; // msb = 1, if r was 0x0000
r = c >> 15; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmpeq2(unsigned int a, unsigned int b)
{
unsigned int r, c;
#if __CUDA_ARCH__ >= 300
r = vseteq2(a, b);
c = r << 16; // convert bool
r = c - r; // into mask
#else
// inspired by Alan Mycroft's null-byte detection algorithm:
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
r = a ^ b; // 0x0000 if a == b
c = r | 0x80008000; // set msbs, to catch carry out
r = r ^ c; // extract msbs, msb = 1 if r < 0x8000
c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
c = r & ~c; // msb = 1, if r was 0x0000
r = c >> 15; // convert
r = c - r; // msbs to
r = c | r; // mask
#endif
return r;
}
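    // Note on the mask trick: given per-lane booleans in r (0 or 1 per
    // halfword), c = r << 16 followed by r = c - r is lane-wise
    // multiplication by 0xffff, turning each 1 into 0xffff and leaving 0
    // as 0x0000 (e.g. r = 0x00000001 yields 0x0000ffff).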
static __device__ __forceinline__ unsigned int vsetge2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset2.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int c;
asm("not.b32 %0, %0;" : "+r"(b));
c = vavrg2(a, b); // (a + ~b + 1) / 2 = (a - b) / 2
c = c & 0x80008000; // msb = carry-outs
r = c >> 15; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmpge2(unsigned int a, unsigned int b)
{
unsigned int r, c;
#if __CUDA_ARCH__ >= 300
r = vsetge2(a, b);
c = r << 16; // convert bool
r = c - r; // into mask
#else
asm("not.b32 %0, %0;" : "+r"(b));
c = vavrg2(a, b); // (a + ~b + 1) / 2 = (a - b) / 2
c = c & 0x80008000; // msb = carry-outs
r = c >> 15; // convert
r = c - r; // msbs to
r = c | r; // mask
#endif
return r;
}
static __device__ __forceinline__ unsigned int vsetgt2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset2.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int c;
asm("not.b32 %0, %0;" : "+r"(b));
c = vavg2(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
c = c & 0x80008000; // msbs = carry-outs
r = c >> 15; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmpgt2(unsigned int a, unsigned int b)
{
unsigned int r, c;
#if __CUDA_ARCH__ >= 300
r = vsetgt2(a, b);
c = r << 16; // convert bool
r = c - r; // into mask
#else
asm("not.b32 %0, %0;" : "+r"(b));
c = vavg2(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
c = c & 0x80008000; // msbs = carry-outs
r = c >> 15; // convert
r = c - r; // msbs to
r = c | r; // mask
#endif
return r;
}
static __device__ __forceinline__ unsigned int vsetle2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset2.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int c;
asm("not.b32 %0, %0;" : "+r"(a));
c = vavrg2(a, b); // (b + ~a + 1) / 2 = (b - a) / 2
c = c & 0x80008000; // msb = carry-outs
r = c >> 15; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmple2(unsigned int a, unsigned int b)
{
unsigned int r, c;
#if __CUDA_ARCH__ >= 300
r = vsetle2(a, b);
c = r << 16; // convert bool
r = c - r; // into mask
#else
asm("not.b32 %0, %0;" : "+r"(a));
c = vavrg2(a, b); // (b + ~a + 1) / 2 = (b - a) / 2
c = c & 0x80008000; // msb = carry-outs
r = c >> 15; // convert
r = c - r; // msbs to
r = c | r; // mask
#endif
return r;
}
static __device__ __forceinline__ unsigned int vsetlt2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset2.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int c;
asm("not.b32 %0, %0;" : "+r"(a));
c = vavg2(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
c = c & 0x80008000; // msb = carry-outs
r = c >> 15; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmplt2(unsigned int a, unsigned int b)
{
unsigned int r, c;
#if __CUDA_ARCH__ >= 300
r = vsetlt2(a, b);
c = r << 16; // convert bool
r = c - r; // into mask
#else
asm("not.b32 %0, %0;" : "+r"(a));
c = vavg2(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
c = c & 0x80008000; // msb = carry-outs
r = c >> 15; // convert
r = c - r; // msbs to
r = c | r; // mask
#endif
return r;
}
static __device__ __forceinline__ unsigned int vsetne2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm ("vset2.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
// inspired by Alan Mycroft's null-byte detection algorithm:
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
unsigned int c;
r = a ^ b; // 0x0000 if a == b
c = r | 0x80008000; // set msbs, to catch carry out
c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
c = r | c; // msb = 1, if r was not 0x0000
c = c & 0x80008000; // extract msbs
r = c >> 15; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmpne2(unsigned int a, unsigned int b)
{
unsigned int r, c;
#if __CUDA_ARCH__ >= 300
r = vsetne2(a, b);
c = r << 16; // convert bool
r = c - r; // into mask
#else
// inspired by Alan Mycroft's null-byte detection algorithm:
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
r = a ^ b; // 0x0000 if a == b
c = r | 0x80008000; // set msbs, to catch carry out
c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
c = r | c; // msb = 1, if r was not 0x0000
c = c & 0x80008000; // extract msbs
r = c >> 15; // convert
r = c - r; // msbs to
r = c | r; // mask
#endif
return r;
}
static __device__ __forceinline__ unsigned int vmax2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int s, t, u;
r = a & 0x0000ffff; // extract low halfword
s = b & 0x0000ffff; // extract low halfword
t = ::max(r, s); // maximum of low halfwords
r = a & 0xffff0000; // extract high halfword
s = b & 0xffff0000; // extract high halfword
u = ::max(r, s); // maximum of high halfwords
r = t | u; // combine halfword maximums
#endif
return r;
}
static __device__ __forceinline__ unsigned int vmin2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int s, t, u;
r = a & 0x0000ffff; // extract low halfword
s = b & 0x0000ffff; // extract low halfword
t = ::min(r, s); // minimum of low halfwords
r = a & 0xffff0000; // extract high halfword
s = b & 0xffff0000; // extract high halfword
u = ::min(r, s); // minimum of high halfwords
r = t | u; // combine halfword minimums
#endif
return r;
}
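// Editor's aside: the pre-vmin/vmax fallback above is plain lane masking; a
// standalone host-side rendering of vmax2 makes the trick visible. The masks
// keep the two halfwords disjoint (the high lane keeps its zeroed low bits
// during the 32-bit compare), so a scalar max per masked lane plus an OR
// reassembles the vector result.
#include <algorithm>
#include <cassert>
#include <cstdint>

static uint32_t vmax2_host(uint32_t a, uint32_t b)
{
    uint32_t lo = std::max(a & 0x0000ffffu, b & 0x0000ffffu);  // low halfword max
    uint32_t hi = std::max(a & 0xffff0000u, b & 0xffff0000u);  // high halfword max
    return lo | hi;                                            // lanes are disjoint
}

int main()
{
    assert(vmax2_host(0x00010004u, 0x00030002u) == 0x00030004u);
    return 0;
}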
// 4-lane (per-byte) operations
static __device__ __forceinline__ unsigned int vadd4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int s, t;
s = a ^ b; // sum bits
r = a & 0x7f7f7f7f; // clear msbs
t = b & 0x7f7f7f7f; // clear msbs
s = s & 0x80808080; // msb sum bits
r = r + t; // add without msbs, record carry-out in msbs
r = r ^ s; // sum of msb sum and carry-in bits, w/o carry-out
#endif /* __CUDA_ARCH__ >= 300 */
return r;
}
static __device__ __forceinline__ unsigned int vsub4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int s, t;
s = a ^ ~b; // inverted sum bits
r = a | 0x80808080; // set msbs
t = b & 0x7f7f7f7f; // clear msbs
s = s & 0x80808080; // inverted msb sum bits
r = r - t; // subtract w/o msbs, record inverted borrows in msb
r = r ^ s; // combine inverted msb sum bits and borrows
#endif
return r;
}
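// Editor's aside: a standalone host-side sketch of the SWAR trick shared by
// the vadd4/vsub4 fallbacks above. Clearing the msbs first means a byte's
// carry can reach its own msb but never ripple into the neighbouring lane;
// the true msb sum bits are then patched back in with XOR. Note that, read
// literally, this fallback wraps modulo 256 per byte rather than saturating.
#include <cassert>
#include <cstdint>

static uint32_t vadd4_host(uint32_t a, uint32_t b)
{
    uint32_t s = (a ^ b) & 0x80808080u;                   // msb sum bits, no carry-in
    uint32_t r = (a & 0x7f7f7f7fu) + (b & 0x7f7f7f7fu);   // 7-bit adds; carries stop at msbs
    return r ^ s;                                         // fold the msb sum bits back in
}

int main()
{
    // per byte: 0x01+0x01=0x02, 0xff+0x01 wraps to 0x00, 0x02+0x01=0x03, 0xff+0x01=0x00
    assert(vadd4_host(0x01ff02ffu, 0x01010101u) == 0x02000300u);
    return 0;
}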
static __device__ __forceinline__ unsigned int vavg4(unsigned int a, unsigned int b)
{
unsigned int r, s;
// HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
// (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
s = a ^ b;
r = a & b;
s = s & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
s = s >> 1;
s = r + s;
return s;
}
static __device__ __forceinline__ unsigned int vavrg4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vavrg4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
// HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
// (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
unsigned int c;
c = a ^ b;
r = a | b;
c = c & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
c = c >> 1;
r = r - c;
#endif
return r;
}
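// Editor's aside: the two HAKMEM #23 identities quoted above are easy to
// verify exhaustively on the host; they matter here because they compute
// byte averages without ever forming the 9-bit intermediate a + b, so each
// lane stays self-contained. A standalone check:
#include <cassert>
#include <cstdint>

int main()
{
    for (uint32_t a = 0; a < 256; ++a)
        for (uint32_t b = 0; b < 256; ++b)
        {
            assert(((a & b) + ((a ^ b) >> 1)) == (a + b) / 2);      // floor average
            assert(((a | b) - ((a ^ b) >> 1)) == (a + b + 1) / 2);  // ceiling average
        }
    return 0;
}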
static __device__ __forceinline__ unsigned int vseteq4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset4.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
// inspired by Alan Mycroft's null-byte detection algorithm:
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
unsigned int c;
r = a ^ b; // 0x00 if a == b
c = r | 0x80808080; // set msbs, to catch carry out
r = r ^ c; // extract msbs, msb = 1 if r < 0x80
c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
c = r & ~c; // msb = 1, if r was 0x00
r = c >> 7; // convert to bool
#endif
return r;
}
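// Editor's aside: a standalone host-side walk-through of the Mycroft-style
// zero-byte test above (the halfword vsetne2/vcmpne2 variants earlier use the
// same idea with 16-bit lanes). With r = a ^ b, a byte of r is 0x00 exactly
// where a and b agree; setting the msbs before subtracting 0x01 per byte
// stops borrows from crossing lanes, and the final mask keeps only bytes
// that were both below 0x80 and drained their msb, i.e. exactly the 0x00 bytes.
#include <cassert>
#include <cstdint>

static uint32_t vseteq4_host(uint32_t a, uint32_t b)
{
    uint32_t r = a ^ b;            // 0x00 in bytes where a == b
    uint32_t c = r | 0x80808080u;  // set msbs so the subtraction cannot borrow across lanes
    uint32_t m = r ^ c;            // msb = 1 where the byte of r was < 0x80
    c = c - 0x01010101u;           // msb = 0 only for 0x00 or 0x80 bytes
    c = m & ~c;                    // msb = 1 exactly where the byte of r was 0x00
    return c >> 7;                 // 0x01 per equal byte
}

int main()
{
    // bytes 3 and 1 match, bytes 2 and 0 differ
    assert(vseteq4_host(0xdeadbeefu, 0xde00be00u) == 0x01000100u);
    return 0;
}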
static __device__ __forceinline__ unsigned int vcmpeq4(unsigned int a, unsigned int b)
{
unsigned int r, t;
#if __CUDA_ARCH__ >= 300
r = vseteq4(a, b);
t = r << 8; // convert bool
r = t - r; // to mask
#else
// inspired by Alan Mycroft's null-byte detection algorithm:
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
t = a ^ b; // 0x00 if a == b
r = t | 0x80808080; // set msbs, to catch carry out
t = t ^ r; // extract msbs, msb = 1 if t < 0x80
r = r - 0x01010101; // msb = 0, if t was 0x00 or 0x80
r = t & ~r; // msb = 1, if t was 0x00
t = r >> 7; // build mask
t = r - t; // from
r = t | r; // msbs
#endif
return r;
}
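// Editor's aside: all the vcmp* wrappers above expand a per-byte bool into a
// per-byte mask with the same two instructions, and the trick is compact
// enough to check standalone on the host: shifting each 0x01 up one lane and
// subtracting lets the borrow fill the original byte with 0xff. (The
// halfword variants do the same with a shift of 16.)
#include <cassert>
#include <cstdint>

static uint32_t bool4_to_mask(uint32_t r)   // per byte: 0x01 -> 0xff, 0x00 -> 0x00
{
    return (r << 8) - r;
}

int main()
{
    assert(bool4_to_mask(0x01000100u) == 0xff00ff00u);
    assert(bool4_to_mask(0x01010101u) == 0xffffffffu);
    return 0;
}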
static __device__ __forceinline__ unsigned int vsetle4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset4.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int c;
asm("not.b32 %0, %0;" : "+r"(a));
c = vavrg4(a, b); // (b + ~a + 1) / 2 = (b - a) / 2
c = c & 0x80808080; // msb = carry-outs
r = c >> 7; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmple4(unsigned int a, unsigned int b)
{
unsigned int r, c;
#if __CUDA_ARCH__ >= 300
r = vsetle4(a, b);
c = r << 8; // convert bool
r = c - r; // to mask
#else
asm("not.b32 %0, %0;" : "+r"(a));
c = vavrg4(a, b); // (b + ~a + 1) / 2 = (b - a) / 2
c = c & 0x80808080; // msbs = carry-outs
r = c >> 7; // convert
r = c - r; // msbs to
r = c | r; // mask
#endif
return r;
}
static __device__ __forceinline__ unsigned int vsetlt4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset4.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int c;
asm("not.b32 %0, %0;" : "+r"(a));
c = vavg4(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
c = c & 0x80808080; // msb = carry-outs
r = c >> 7; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmplt4(unsigned int a, unsigned int b)
{
unsigned int r, c;
#if __CUDA_ARCH__ >= 300
r = vsetlt4(a, b);
c = r << 8; // convert bool
r = c - r; // to mask
#else
asm("not.b32 %0, %0;" : "+r"(a));
c = vavg4(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
c = c & 0x80808080; // msbs = carry-outs
r = c >> 7; // convert
r = c - r; // msbs to
r = c | r; // mask
#endif
return r;
}
static __device__ __forceinline__ unsigned int vsetge4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset4.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int c;
asm("not.b32 %0, %0;" : "+r"(b));
c = vavrg4(a, b); // (a + ~b + 1) / 2 = (a - b) / 2
c = c & 0x80808080; // msb = carry-outs
r = c >> 7; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmpge4(unsigned int a, unsigned int b)
{
unsigned int r, s;
#if __CUDA_ARCH__ >= 300
r = vsetge4(a, b);
s = r << 8; // convert bool
r = s - r; // to mask
#else
asm ("not.b32 %0,%0;" : "+r"(b));
r = vavrg4 (a, b); // (a + ~b + 1) / 2 = (a - b) / 2
r = r & 0x80808080; // msb = carry-outs
s = r >> 7; // build mask
s = r - s; // from
r = s | r; // msbs
#endif
return r;
}
static __device__ __forceinline__ unsigned int vsetgt4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset4.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int c;
asm("not.b32 %0, %0;" : "+r"(b));
c = vavg4(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
c = c & 0x80808080; // msb = carry-outs
r = c >> 7; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmpgt4(unsigned int a, unsigned int b)
{
unsigned int r, c;
#if __CUDA_ARCH__ >= 300
r = vsetgt4(a, b);
c = r << 8; // convert bool
r = c - r; // to mask
#else
asm("not.b32 %0, %0;" : "+r"(b));
c = vavg4(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
c = c & 0x80808080; // msb = carry-outs
r = c >> 7; // convert
r = c - r; // msbs to
r = c | r; // mask
#endif
return r;
}
static __device__ __forceinline__ unsigned int vsetne4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset4.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
// inspired by Alan Mycroft's null-byte detection algorithm:
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
unsigned int c;
r = a ^ b; // 0x00 if a == b
c = r | 0x80808080; // set msbs, to catch carry out
c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
c = r | c; // msb = 1, if r was not 0x00
c = c & 0x80808080; // extract msbs
r = c >> 7; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmpne4(unsigned int a, unsigned int b)
{
unsigned int r, c;
#if __CUDA_ARCH__ >= 300
r = vsetne4(a, b);
c = r << 8; // convert bool
r = c - r; // to mask
#else
// inspired by Alan Mycroft's null-byte detection algorithm:
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
r = a ^ b; // 0x00 if a == b
c = r | 0x80808080; // set msbs, to catch carry out
c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
c = r | c; // msb = 1, if r was not 0x00
c = c & 0x80808080; // extract msbs
r = c >> 7; // convert
r = c - r; // msbs to
r = c | r; // mask
#endif
return r;
}
static __device__ __forceinline__ unsigned int vabsdiff4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int s;
s = vcmpge4(a, b); // mask = 0xff if a >= b
r = a ^ b; //
s = (r & s) ^ b; // select a when a >= b, else select b => max(a,b)
r = s ^ r; // select a when b >= a, else select b => min(a,b)
r = s - r; // |a - b| = max(a,b) - min(a,b);
#endif
return r;
}
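// Editor's aside: a standalone host-side sketch of the selection dance in
// the vabsdiff4 fallback above. With a full byte mask m = (a >= b), the
// identity ((a ^ b) & m) ^ b picks max(a, b) per byte, XOR-ing that with
// a ^ b flips each byte to min(a, b), and max - min never borrows across
// lanes. The mask is supplied by hand here instead of re-deriving vcmpge4.
#include <cassert>
#include <cstdint>

static uint32_t vabsdiff4_host(uint32_t a, uint32_t b, uint32_t geMask)
{
    uint32_t x  = a ^ b;
    uint32_t mx = (x & geMask) ^ b;   // max(a, b) per byte
    uint32_t mn = mx ^ x;             // min(a, b) per byte
    return mx - mn;                   // per-byte |a - b|, no cross-lane borrow
}

int main()
{
    // bytes: |0-0| = 0, |9-2| = 7, |0-0| = 0, |1-4| = 3
    assert(vabsdiff4_host(0x00090001u, 0x00020004u, 0xffffff00u) == 0x00070003u);
    return 0;
}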
static __device__ __forceinline__ unsigned int vmax4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int s;
s = vcmpge4(a, b); // mask = 0xff if a >= b
r = a & s; // select a when a >= b
s = b & ~s; // select b when a < b
r = r | s; // combine byte selections
#endif
return r; // byte-wise unsigned maximum
}
static __device__ __forceinline__ unsigned int vmin4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int s;
s = vcmpge4(b, a); // mask = 0xff if b >= a
r = a & s; // select a when b >= a
s = b & ~s; // select b when b < a
r = r | s; // combine byte selections
#endif
return r;
}
}}}
//! @endcond
#endif // OPENCV_CUDA_SIMD_FUNCTIONS_HPP

View File

@@ -0,0 +1,75 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_CUDA_TRANSFORM_HPP
#define OPENCV_CUDA_TRANSFORM_HPP
#include "common.hpp"
#include "utility.hpp"
#include "detail/transform_detail.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
template <typename T, typename D, typename UnOp, typename Mask>
static inline void transform(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, const Mask& mask, cudaStream_t stream)
{
typedef TransformFunctorTraits<UnOp> ft;
transform_detail::TransformDispatcher<VecTraits<T>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src, dst, op, mask, stream);
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static inline void transform(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, const Mask& mask, cudaStream_t stream)
{
typedef TransformFunctorTraits<BinOp> ft;
transform_detail::TransformDispatcher<VecTraits<T1>::cn == 1 && VecTraits<T2>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src1, src2, dst, op, mask, stream);
}
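// Editor's aside: a minimal usage sketch, not from this repository, of how
// OpenCV's own .cu files drive this dispatcher (assumes a CUDA build and
// nvcc; the functor name Scale and the factor 0.5f are hypothetical). The
// VecTraits<...>::cn == 1 test above means only single-channel types take
// the vectorized "smart shift" path.
//
//   struct Scale
//   {
//       __device__ __forceinline__ float operator ()(uchar v) const { return 0.5f * v; }
//   };
//
//   void scaleImage(PtrStepSz<uchar> src, PtrStepSz<float> dst, cudaStream_t stream)
//   {
//       transform(src, dst, Scale(), WithOutMask(), stream);
//   }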
}}}
//! @endcond
#endif // OPENCV_CUDA_TRANSFORM_HPP

View File

@@ -0,0 +1,90 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_CUDA_TYPE_TRAITS_HPP
#define OPENCV_CUDA_TYPE_TRAITS_HPP
#include "detail/type_traits_detail.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
template <typename T> struct IsSimpleParameter
{
enum {value = type_traits_detail::IsIntegral<T>::value || type_traits_detail::IsFloat<T>::value ||
type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<T>::type>::value};
};
template <typename T> struct TypeTraits
{
typedef typename type_traits_detail::UnConst<T>::type NonConstType;
typedef typename type_traits_detail::UnVolatile<T>::type NonVolatileType;
typedef typename type_traits_detail::UnVolatile<typename type_traits_detail::UnConst<T>::type>::type UnqualifiedType;
typedef typename type_traits_detail::PointerTraits<UnqualifiedType>::type PointeeType;
typedef typename type_traits_detail::ReferenceTraits<T>::type ReferredType;
enum { isConst = type_traits_detail::UnConst<T>::value };
enum { isVolatile = type_traits_detail::UnVolatile<T>::value };
enum { isReference = type_traits_detail::ReferenceTraits<UnqualifiedType>::value };
enum { isPointer = type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<UnqualifiedType>::type>::value };
enum { isUnsignedInt = type_traits_detail::IsUnsignedIntegral<UnqualifiedType>::value };
enum { isSignedInt = type_traits_detail::IsSignedIntergral<UnqualifiedType>::value };
enum { isIntegral = type_traits_detail::IsIntegral<UnqualifiedType>::value };
enum { isFloat = type_traits_detail::IsFloat<UnqualifiedType>::value };
enum { isArith = isIntegral || isFloat };
enum { isVec = type_traits_detail::IsVec<UnqualifiedType>::value };
typedef typename type_traits_detail::Select<IsSimpleParameter<UnqualifiedType>::value,
T, typename type_traits_detail::AddParameterType<T>::type>::type ParameterType;
};
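// Editor's aside (a hedged illustration, not part of the original header):
// by construction, arithmetic scalars count as "simple parameters" and pass
// by value, while qualifiers are stripped step by step. Under these
// definitions one would expect, for example:
//
//   TypeTraits<const float>::isConst         // 1
//   TypeTraits<const float>::UnqualifiedType // float
//   TypeTraits<float>::isFloat               // 1, so ParameterType is float by value
//   TypeTraits<int*>::isPointer              // 1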
}}}
//! @endcond
#endif // OPENCV_CUDA_TYPE_TRAITS_HPP

View File

@@ -0,0 +1,230 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_CUDA_UTILITY_HPP
#define OPENCV_CUDA_UTILITY_HPP
#include "saturate_cast.hpp"
#include "datamov_utils.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
struct CV_EXPORTS ThrustAllocator
{
typedef uchar value_type;
virtual ~ThrustAllocator();
virtual __device__ __host__ uchar* allocate(size_t numBytes) = 0;
virtual __device__ __host__ void deallocate(uchar* ptr, size_t numBytes) = 0;
static ThrustAllocator& getAllocator();
static void setAllocator(ThrustAllocator* allocator);
};
#define OPENCV_CUDA_LOG_WARP_SIZE (5)
#define OPENCV_CUDA_WARP_SIZE (1 << OPENCV_CUDA_LOG_WARP_SIZE)
#define OPENCV_CUDA_LOG_MEM_BANKS ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla
#define OPENCV_CUDA_MEM_BANKS (1 << OPENCV_CUDA_LOG_MEM_BANKS)
///////////////////////////////////////////////////////////////////////////////
// swap
template <typename T> void __device__ __host__ __forceinline__ swap(T& a, T& b)
{
const T temp = a;
a = b;
b = temp;
}
///////////////////////////////////////////////////////////////////////////////
// Mask Reader
struct SingleMask
{
explicit __host__ __device__ __forceinline__ SingleMask(PtrStepb mask_) : mask(mask_) {}
__host__ __device__ __forceinline__ SingleMask(const SingleMask& mask_): mask(mask_.mask){}
__device__ __forceinline__ bool operator()(int y, int x) const
{
return mask.ptr(y)[x] != 0;
}
PtrStepb mask;
};
struct SingleMaskChannels
{
__host__ __device__ __forceinline__ SingleMaskChannels(PtrStepb mask_, int channels_)
: mask(mask_), channels(channels_) {}
__host__ __device__ __forceinline__ SingleMaskChannels(const SingleMaskChannels& mask_)
:mask(mask_.mask), channels(mask_.channels){}
__device__ __forceinline__ bool operator()(int y, int x) const
{
return mask.ptr(y)[x / channels] != 0;
}
PtrStepb mask;
int channels;
};
struct MaskCollection
{
explicit __host__ __device__ __forceinline__ MaskCollection(PtrStepb* maskCollection_)
: maskCollection(maskCollection_) {}
__device__ __forceinline__ MaskCollection(const MaskCollection& masks_)
: maskCollection(masks_.maskCollection), curMask(masks_.curMask){}
__device__ __forceinline__ void next()
{
curMask = *maskCollection++;
}
__device__ __forceinline__ void setMask(int z)
{
curMask = maskCollection[z];
}
__device__ __forceinline__ bool operator()(int y, int x) const
{
uchar val;
return curMask.data == 0 || (ForceGlob<uchar>::Load(curMask.ptr(y), x, val), (val != 0));
}
const PtrStepb* maskCollection;
PtrStepb curMask;
};
struct WithOutMask
{
__host__ __device__ __forceinline__ WithOutMask(){}
__host__ __device__ __forceinline__ WithOutMask(const WithOutMask&){}
__device__ __forceinline__ void next() const
{
}
__device__ __forceinline__ void setMask(int) const
{
}
__device__ __forceinline__ bool operator()(int, int) const
{
return true;
}
__device__ __forceinline__ bool operator()(int, int, int) const
{
return true;
}
static __device__ __forceinline__ bool check(int, int)
{
return true;
}
static __device__ __forceinline__ bool check(int, int, int)
{
return true;
}
};
///////////////////////////////////////////////////////////////////////////////
// Solve linear system
// solve 2x2 linear system Ax=b
template <typename T> __device__ __forceinline__ bool solve2x2(const T A[2][2], const T b[2], T x[2])
{
T det = A[0][0] * A[1][1] - A[1][0] * A[0][1];
if (det != 0)
{
double invdet = 1.0 / det;
x[0] = saturate_cast<T>(invdet * (b[0] * A[1][1] - b[1] * A[0][1]));
x[1] = saturate_cast<T>(invdet * (A[0][0] * b[1] - A[1][0] * b[0]));
return true;
}
return false;
}
// solve 3x3 linear system Ax=b
template <typename T> __device__ __forceinline__ bool solve3x3(const T A[3][3], const T b[3], T x[3])
{
T det = A[0][0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1])
- A[0][1] * (A[1][0] * A[2][2] - A[1][2] * A[2][0])
+ A[0][2] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]);
if (det != 0)
{
double invdet = 1.0 / det;
x[0] = saturate_cast<T>(invdet *
(b[0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -
A[0][1] * (b[1] * A[2][2] - A[1][2] * b[2] ) +
A[0][2] * (b[1] * A[2][1] - A[1][1] * b[2] )));
x[1] = saturate_cast<T>(invdet *
(A[0][0] * (b[1] * A[2][2] - A[1][2] * b[2] ) -
b[0] * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +
A[0][2] * (A[1][0] * b[2] - b[1] * A[2][0])));
x[2] = saturate_cast<T>(invdet *
(A[0][0] * (A[1][1] * b[2] - b[1] * A[2][1]) -
A[0][1] * (A[1][0] * b[2] - b[1] * A[2][0]) +
b[0] * (A[1][0] * A[2][1] - A[1][1] * A[2][0])));
return true;
}
return false;
}
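// Editor's aside: both solvers above are Cramer's rule; below is a standalone
// host transcription of solve2x2 with a small worked example. For
// A = [[2, 1], [1, 3]] and b = (5, 10), det = 5 and x = (1, 3), since
// 2*1 + 1*3 = 5 and 1*1 + 3*3 = 10 (exact in doubles for these integers).
#include <cassert>

static bool solve2x2_host(const double A[2][2], const double b[2], double x[2])
{
    double det = A[0][0] * A[1][1] - A[1][0] * A[0][1];
    if (det == 0)
        return false;                                   // singular system
    double invdet = 1.0 / det;
    x[0] = invdet * (b[0] * A[1][1] - b[1] * A[0][1]);  // det of A with column 0 -> b
    x[1] = invdet * (A[0][0] * b[1] - A[1][0] * b[0]);  // det of A with column 1 -> b
    return true;
}

int main()
{
    const double A[2][2] = { { 2, 1 }, { 1, 3 } };
    const double b[2] = { 5, 10 };
    double x[2];
    assert(solve2x2_host(A, b, x) && x[0] == 1 && x[1] == 3);
    return 0;
}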
}}} // namespace cv { namespace cuda { namespace device
//! @endcond
#endif // OPENCV_CUDA_UTILITY_HPP

View File

@@ -0,0 +1,232 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_CUDA_VEC_DISTANCE_HPP
#define OPENCV_CUDA_VEC_DISTANCE_HPP
#include "reduce.hpp"
#include "functional.hpp"
#include "detail/vec_distance_detail.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
template <typename T> struct L1Dist
{
typedef int value_type;
typedef int result_type;
__device__ __forceinline__ L1Dist() : mySum(0) {}
__device__ __forceinline__ void reduceIter(int val1, int val2)
{
mySum = __sad(val1, val2, mySum);
}
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
{
reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
}
__device__ __forceinline__ operator int() const
{
return mySum;
}
int mySum;
};
template <> struct L1Dist<float>
{
typedef float value_type;
typedef float result_type;
__device__ __forceinline__ L1Dist() : mySum(0.0f) {}
__device__ __forceinline__ void reduceIter(float val1, float val2)
{
mySum += ::fabs(val1 - val2);
}
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
{
reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
}
__device__ __forceinline__ operator float() const
{
return mySum;
}
float mySum;
};
struct L2Dist
{
typedef float value_type;
typedef float result_type;
__device__ __forceinline__ L2Dist() : mySum(0.0f) {}
__device__ __forceinline__ void reduceIter(float val1, float val2)
{
float reg = val1 - val2;
mySum += reg * reg;
}
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
{
reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
}
__device__ __forceinline__ operator float() const
{
return sqrtf(mySum);
}
float mySum;
};
struct HammingDist
{
typedef int value_type;
typedef int result_type;
__device__ __forceinline__ HammingDist() : mySum(0) {}
__device__ __forceinline__ void reduceIter(int val1, int val2)
{
mySum += __popc(val1 ^ val2);
}
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
{
reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
}
__device__ __forceinline__ operator int() const
{
return mySum;
}
int mySum;
};
// calc distance between two vectors in global memory
template <int THREAD_DIM, typename Dist, typename T1, typename T2>
__device__ void calcVecDiffGlobal(const T1* vec1, const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid)
{
for (int i = tid; i < len; i += THREAD_DIM)
{
T1 val1;
ForceGlob<T1>::Load(vec1, i, val1);
T2 val2;
ForceGlob<T2>::Load(vec2, i, val2);
dist.reduceIter(val1, val2);
}
dist.reduceAll<THREAD_DIM>(smem, tid);
}
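// Editor's aside: a serial host-side model (an illustration, not device code)
// of the access pattern above. Thread t accumulates elements
// t, t + THREAD_DIM, t + 2 * THREAD_DIM, ... into its own partial distance,
// and reduceAll then sums the THREAD_DIM partials; here both stages are
// simulated with plain loops, using an L1-style |v1[i] - v2[i]| accumulator.
#include <cassert>

int main()
{
    const int THREAD_DIM = 4, len = 10;
    int v1[len], v2[len];
    for (int i = 0; i < len; ++i) { v1[i] = i; v2[i] = 2 * i; }   // |diff| = i

    int partial[THREAD_DIM] = {};
    for (int tid = 0; tid < THREAD_DIM; ++tid)        // one iteration per "thread"
        for (int i = tid; i < len; i += THREAD_DIM)   // strided, coalesced-style walk
            partial[tid] += (v1[i] > v2[i]) ? v1[i] - v2[i] : v2[i] - v1[i];

    int total = 0;                                    // stands in for reduceAll
    for (int t = 0; t < THREAD_DIM; ++t)
        total += partial[t];
    assert(total == 45);                              // sum of |i - 2i| for i in 0..9
    return 0;
}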
// calc distance between two vectors, first vector is cached in register or shared memory, second vector is in global memory
template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T1, typename T2>
__device__ __forceinline__ void calcVecDiffCached(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, typename Dist::result_type* smem, int tid)
{
vec_distance_detail::VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>::calc(vecCached, vecGlob, len, dist, tid);
dist.reduceAll<THREAD_DIM>(smem, tid);
}
// calc distance between two vectors in global memory
template <int THREAD_DIM, typename T1> struct VecDiffGlobal
{
explicit __device__ __forceinline__ VecDiffGlobal(const T1* vec1_, int = 0, void* = 0, int = 0, int = 0)
{
vec1 = vec1_;
}
template <typename T2, typename Dist>
__device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const
{
calcVecDiffGlobal<THREAD_DIM>(vec1, vec2, len, dist, smem, tid);
}
const T1* vec1;
};
// calc distance between two vectors, first vector is cached in register memory, second vector is in global memory
template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename U> struct VecDiffCachedRegister
{
template <typename T1> __device__ __forceinline__ VecDiffCachedRegister(const T1* vec1, int len, U* smem, int glob_tid, int tid)
{
if (glob_tid < len)
smem[glob_tid] = vec1[glob_tid];
__syncthreads();
U* vec1ValsPtr = vec1Vals;
#pragma unroll
for (int i = tid; i < MAX_LEN; i += THREAD_DIM)
*vec1ValsPtr++ = smem[i];
__syncthreads();
}
template <typename T2, typename Dist>
__device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const
{
calcVecDiffCached<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>(vec1Vals, vec2, len, dist, smem, tid);
}
U vec1Vals[MAX_LEN / THREAD_DIM];
};
}}} // namespace cv { namespace cuda { namespace device
//! @endcond
#endif // OPENCV_CUDA_VEC_DISTANCE_HPP

View File

@@ -0,0 +1,923 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_CUDA_VECMATH_HPP
#define OPENCV_CUDA_VECMATH_HPP
#include "vec_traits.hpp"
#include "saturate_cast.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
// saturate_cast
namespace vec_math_detail
{
template <int cn, typename VecD> struct SatCastHelper;
template <typename VecD> struct SatCastHelper<1, VecD>
{
template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
{
typedef typename VecTraits<VecD>::elem_type D;
return VecTraits<VecD>::make(saturate_cast<D>(v.x));
}
};
template <typename VecD> struct SatCastHelper<2, VecD>
{
template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
{
typedef typename VecTraits<VecD>::elem_type D;
return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y));
}
};
template <typename VecD> struct SatCastHelper<3, VecD>
{
template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
{
typedef typename VecTraits<VecD>::elem_type D;
return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z));
}
};
template <typename VecD> struct SatCastHelper<4, VecD>
{
template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
{
typedef typename VecTraits<VecD>::elem_type D;
return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z), saturate_cast<D>(v.w));
}
};
template <typename VecD, typename VecS> static __device__ __forceinline__ VecD saturate_cast_helper(const VecS& v)
{
return SatCastHelper<VecTraits<VecD>::cn, VecD>::cast(v);
}
}
template<typename T> static __device__ __forceinline__ T saturate_cast(const uchar1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
template<typename T> static __device__ __forceinline__ T saturate_cast(const char1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
template<typename T> static __device__ __forceinline__ T saturate_cast(const ushort1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
template<typename T> static __device__ __forceinline__ T saturate_cast(const short1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
template<typename T> static __device__ __forceinline__ T saturate_cast(const uint1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
template<typename T> static __device__ __forceinline__ T saturate_cast(const int1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
template<typename T> static __device__ __forceinline__ T saturate_cast(const float1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
template<typename T> static __device__ __forceinline__ T saturate_cast(const double1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
template<typename T> static __device__ __forceinline__ T saturate_cast(const uchar2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
template<typename T> static __device__ __forceinline__ T saturate_cast(const char2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
template<typename T> static __device__ __forceinline__ T saturate_cast(const ushort2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
template<typename T> static __device__ __forceinline__ T saturate_cast(const short2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
template<typename T> static __device__ __forceinline__ T saturate_cast(const uint2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
template<typename T> static __device__ __forceinline__ T saturate_cast(const int2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
template<typename T> static __device__ __forceinline__ T saturate_cast(const float2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
template<typename T> static __device__ __forceinline__ T saturate_cast(const double2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
template<typename T> static __device__ __forceinline__ T saturate_cast(const uchar3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
template<typename T> static __device__ __forceinline__ T saturate_cast(const char3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
template<typename T> static __device__ __forceinline__ T saturate_cast(const ushort3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
template<typename T> static __device__ __forceinline__ T saturate_cast(const short3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
template<typename T> static __device__ __forceinline__ T saturate_cast(const uint3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
template<typename T> static __device__ __forceinline__ T saturate_cast(const int3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
template<typename T> static __device__ __forceinline__ T saturate_cast(const float3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
template<typename T> static __device__ __forceinline__ T saturate_cast(const double3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
template<typename T> static __device__ __forceinline__ T saturate_cast(const uchar4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
template<typename T> static __device__ __forceinline__ T saturate_cast(const char4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
template<typename T> static __device__ __forceinline__ T saturate_cast(const ushort4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
template<typename T> static __device__ __forceinline__ T saturate_cast(const short4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
template<typename T> static __device__ __forceinline__ T saturate_cast(const uint4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
template<typename T> static __device__ __forceinline__ T saturate_cast(const int4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
template<typename T> static __device__ __forceinline__ T saturate_cast(const float4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
template<typename T> static __device__ __forceinline__ T saturate_cast(const double4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
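// Editor's aside: each overload above just applies OpenCV's scalar
// saturate_cast component-wise, so conversions clamp and round per channel.
// A hedged device-code sketch (assumes nvcc; the values are illustrative):
//
//   float3 f = make_float3(-4.2f, 300.0f, 128.6f);
//   uchar3 u = saturate_cast<uchar3>(f);   // (0, 255, 129): clamp to [0,255], round to nearest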
// unary operators
#define CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(op, input_type, output_type) \
__device__ __forceinline__ output_type ## 1 operator op(const input_type ## 1 & a) \
{ \
return VecTraits<output_type ## 1>::make(op (a.x)); \
} \
__device__ __forceinline__ output_type ## 2 operator op(const input_type ## 2 & a) \
{ \
return VecTraits<output_type ## 2>::make(op (a.x), op (a.y)); \
} \
__device__ __forceinline__ output_type ## 3 operator op(const input_type ## 3 & a) \
{ \
return VecTraits<output_type ## 3>::make(op (a.x), op (a.y), op (a.z)); \
} \
__device__ __forceinline__ output_type ## 4 operator op(const input_type ## 4 & a) \
{ \
return VecTraits<output_type ## 4>::make(op (a.x), op (a.y), op (a.z), op (a.w)); \
}
CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(-, char, char)
CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(-, short, short)
CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(-, int, int)
CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(-, float, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(-, double, double)
CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, uchar, uchar)
CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, char, uchar)
CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, ushort, uchar)
CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, short, uchar)
CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, int, uchar)
CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, uint, uchar)
CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, float, uchar)
CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, double, uchar)
CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, uchar, uchar)
CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, char, char)
CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, ushort, ushort)
CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, short, short)
CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, int, int)
CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, uint, uint)
#undef CV_CUDEV_IMPLEMENT_VEC_UNARY_OP
// unary functions
#define CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(func_name, func, input_type, output_type) \
__device__ __forceinline__ output_type ## 1 func_name(const input_type ## 1 & a) \
{ \
return VecTraits<output_type ## 1>::make(func (a.x)); \
} \
__device__ __forceinline__ output_type ## 2 func_name(const input_type ## 2 & a) \
{ \
return VecTraits<output_type ## 2>::make(func (a.x), func (a.y)); \
} \
__device__ __forceinline__ output_type ## 3 func_name(const input_type ## 3 & a) \
{ \
return VecTraits<output_type ## 3>::make(func (a.x), func (a.y), func (a.z)); \
} \
__device__ __forceinline__ output_type ## 4 func_name(const input_type ## 4 & a) \
{ \
return VecTraits<output_type ## 4>::make(func (a.x), func (a.y), func (a.z), func (a.w)); \
}
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::fabsf, float, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrtf, uchar, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrtf, char, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrtf, ushort, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrtf, short, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrtf, int, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrtf, uint, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrtf, float, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrt, double, double)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::expf, uchar, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::expf, char, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::expf, ushort, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::expf, short, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::expf, int, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::expf, uint, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::expf, float, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::exp, double, double)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2f, uchar, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2f, char, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2f, ushort, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2f, short, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2f, int, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2f, uint, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2f, float, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2, double, double)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10f, uchar, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10f, char, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10f, ushort, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10f, short, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10f, int, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10f, uint, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10f, float, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10, double, double)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::logf, uchar, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::logf, char, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::logf, ushort, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::logf, short, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::logf, int, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::logf, uint, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::logf, float, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::log, double, double)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2f, uchar, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2f, char, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2f, ushort, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2f, short, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2f, int, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2f, uint, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2f, float, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2, double, double)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10f, uchar, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10f, char, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10f, ushort, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10f, short, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10f, int, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10f, uint, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10f, float, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10, double, double)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sinf, uchar, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sinf, char, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sinf, ushort, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sinf, short, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sinf, int, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sinf, uint, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sinf, float, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sin, double, double)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cosf, uchar, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cosf, char, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cosf, ushort, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cosf, short, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cosf, int, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cosf, uint, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cosf, float, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cos, double, double)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tanf, uchar, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tanf, char, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tanf, ushort, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tanf, short, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tanf, int, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tanf, uint, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tanf, float, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tan, double, double)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asinf, uchar, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asinf, char, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asinf, ushort, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asinf, short, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asinf, int, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asinf, uint, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asinf, float, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asin, double, double)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acosf, uchar, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acosf, char, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acosf, ushort, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acosf, short, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acosf, int, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acosf, uint, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acosf, float, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acos, double, double)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atanf, uchar, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atanf, char, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atanf, ushort, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atanf, short, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atanf, int, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atanf, uint, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atanf, float, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atan, double, double)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinhf, uchar, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinhf, char, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinhf, ushort, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinhf, short, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinhf, int, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinhf, uint, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinhf, float, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinh, double, double)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::coshf, uchar, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::coshf, char, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::coshf, ushort, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::coshf, short, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::coshf, int, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::coshf, uint, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::coshf, float, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::cosh, double, double)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanhf, uchar, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanhf, char, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanhf, ushort, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanhf, short, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanhf, int, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanhf, uint, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanhf, float, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanh, double, double)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinhf, uchar, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinhf, char, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinhf, ushort, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinhf, short, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinhf, int, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinhf, uint, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinhf, float, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinh, double, double)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acoshf, uchar, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acoshf, char, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acoshf, ushort, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acoshf, short, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acoshf, int, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acoshf, uint, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acoshf, float, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acosh, double, double)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanhf, uchar, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanhf, char, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanhf, ushort, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanhf, short, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanhf, int, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanhf, uint, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanhf, float, float)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanh, double, double)
#undef CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC
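// Editor's aside: a hedged device-code sketch of the pattern just defined.
// Each math function maps component-wise, and integer inputs promote to
// float results, per the (func, input_type, float) rows above:
//
//   uchar2 p = make_uchar2(4, 9);
//   float2 s = sqrt(p);   // (2.0f, 3.0f)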
// binary operators (vec & vec)
#define CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(op, input_type, output_type) \
__device__ __forceinline__ output_type ## 1 operator op(const input_type ## 1 & a, const input_type ## 1 & b) \
{ \
return VecTraits<output_type ## 1>::make(a.x op b.x); \
} \
__device__ __forceinline__ output_type ## 2 operator op(const input_type ## 2 & a, const input_type ## 2 & b) \
{ \
return VecTraits<output_type ## 2>::make(a.x op b.x, a.y op b.y); \
} \
__device__ __forceinline__ output_type ## 3 operator op(const input_type ## 3 & a, const input_type ## 3 & b) \
{ \
return VecTraits<output_type ## 3>::make(a.x op b.x, a.y op b.y, a.z op b.z); \
} \
__device__ __forceinline__ output_type ## 4 operator op(const input_type ## 4 & a, const input_type ## 4 & b) \
{ \
return VecTraits<output_type ## 4>::make(a.x op b.x, a.y op b.y, a.z op b.z, a.w op b.w); \
}
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, uchar, int)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, char, int)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, ushort, int)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, short, int)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, int, int)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, uint, uint)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, float, float)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, double, double)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, uchar, int)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, char, int)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, ushort, int)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, short, int)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, int, int)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, uint, uint)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, float, float)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, double, double)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, uchar, int)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, char, int)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, ushort, int)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, short, int)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, int, int)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, uint, uint)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, float, float)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, double, double)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, uchar, int)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, char, int)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, ushort, int)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, short, int)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, int, int)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, uint, uint)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, float, float)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, double, double)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, uchar, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, char, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, ushort, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, short, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, int, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, uint, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, float, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, double, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, uchar, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, char, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, ushort, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, short, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, int, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, uint, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, float, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, double, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, uchar, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, char, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, ushort, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, short, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, int, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, uint, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, float, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, double, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, uchar, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, char, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, ushort, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, short, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, int, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, uint, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, float, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, double, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, uchar, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, char, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, ushort, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, short, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, int, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, uint, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, float, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, double, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, uchar, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, char, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, ushort, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, short, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, int, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, uint, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, float, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, double, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, uchar, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, char, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, ushort, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, short, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, int, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, uint, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, float, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, double, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, uchar, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, char, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, ushort, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, short, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, int, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, uint, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, float, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, double, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, uchar, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, char, char)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, ushort, ushort)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, short, short)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, int, int)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, uint, uint)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, uchar, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, char, char)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, ushort, ushort)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, short, short)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, int, int)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, uint, uint)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, uchar, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, char, char)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, ushort, ushort)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, short, short)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, int, int)
CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, uint, uint)
#undef CV_CUDEV_IMPLEMENT_VEC_BINARY_OP
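// Illustrative usage sketch (added for clarity, not part of the original header):
// the macro above generates element-wise operators whose result type is widened
// for narrow inputs to avoid overflow, while comparisons yield a uchar mask:
//
//   __device__ void vec_op_example(const uchar3& a, const uchar3& b)
//   {
//       int3   sum  = a + b;  // uchar3 + uchar3 -> int3 (widened result)
//       uchar3 mask = a < b;  // per-channel 0/1 mask
//   }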
// binary operators (vec & scalar)
#define CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(op, input_type, scalar_type, output_type) \
__device__ __forceinline__ output_type ## 1 operator op(const input_type ## 1 & a, scalar_type s) \
{ \
return VecTraits<output_type ## 1>::make(a.x op s); \
} \
__device__ __forceinline__ output_type ## 1 operator op(scalar_type s, const input_type ## 1 & b) \
{ \
return VecTraits<output_type ## 1>::make(s op b.x); \
} \
__device__ __forceinline__ output_type ## 2 operator op(const input_type ## 2 & a, scalar_type s) \
{ \
return VecTraits<output_type ## 2>::make(a.x op s, a.y op s); \
} \
__device__ __forceinline__ output_type ## 2 operator op(scalar_type s, const input_type ## 2 & b) \
{ \
return VecTraits<output_type ## 2>::make(s op b.x, s op b.y); \
} \
__device__ __forceinline__ output_type ## 3 operator op(const input_type ## 3 & a, scalar_type s) \
{ \
return VecTraits<output_type ## 3>::make(a.x op s, a.y op s, a.z op s); \
} \
__device__ __forceinline__ output_type ## 3 operator op(scalar_type s, const input_type ## 3 & b) \
{ \
return VecTraits<output_type ## 3>::make(s op b.x, s op b.y, s op b.z); \
} \
__device__ __forceinline__ output_type ## 4 operator op(const input_type ## 4 & a, scalar_type s) \
{ \
return VecTraits<output_type ## 4>::make(a.x op s, a.y op s, a.z op s, a.w op s); \
} \
__device__ __forceinline__ output_type ## 4 operator op(scalar_type s, const input_type ## 4 & b) \
{ \
return VecTraits<output_type ## 4>::make(s op b.x, s op b.y, s op b.z, s op b.w); \
}
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, uchar, int, int)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, uchar, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, uchar, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, char, int, int)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, char, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, char, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, ushort, int, int)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, ushort, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, ushort, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, short, int, int)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, short, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, short, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, int, int, int)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, int, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, int, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, uint, uint, uint)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, uint, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, uint, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, float, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, float, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, double, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, uchar, int, int)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, uchar, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, uchar, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, char, int, int)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, char, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, char, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, ushort, int, int)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, ushort, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, ushort, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, short, int, int)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, short, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, short, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, int, int, int)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, int, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, int, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, uint, uint, uint)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, uint, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, uint, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, float, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, float, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, double, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, uchar, int, int)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, uchar, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, uchar, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, char, int, int)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, char, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, char, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, ushort, int, int)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, ushort, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, ushort, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, short, int, int)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, short, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, short, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, int, int, int)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, int, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, int, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, uint, uint, uint)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, uint, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, uint, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, float, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, float, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, double, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, uchar, int, int)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, uchar, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, uchar, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, char, int, int)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, char, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, char, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, ushort, int, int)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, ushort, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, ushort, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, short, int, int)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, short, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, short, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, int, int, int)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, int, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, int, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, uint, uint, uint)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, uint, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, uint, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, float, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, float, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, double, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, uchar, uchar, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, char, char, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, ushort, ushort, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, short, short, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, int, int, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, uint, uint, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, float, float, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, double, double, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, uchar, uchar, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, char, char, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, ushort, ushort, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, short, short, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, int, int, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, uint, uint, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, float, float, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, double, double, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, uchar, uchar, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, char, char, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, ushort, ushort, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, short, short, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, int, int, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, uint, uint, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, float, float, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, double, double, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, uchar, uchar, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, char, char, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, ushort, ushort, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, short, short, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, int, int, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, uint, uint, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, float, float, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, double, double, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, uchar, uchar, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, char, char, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, ushort, ushort, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, short, short, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, int, int, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, uint, uint, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, float, float, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, double, double, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, uchar, uchar, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, char, char, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, ushort, ushort, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, short, short, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, int, int, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, uint, uint, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, float, float, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, double, double, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, uchar, uchar, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, char, char, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, ushort, ushort, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, short, short, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, int, int, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, uint, uint, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, float, float, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, double, double, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, uchar, uchar, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, char, char, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, ushort, ushort, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, short, short, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, int, int, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, uint, uint, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, float, float, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, double, double, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&, uchar, uchar, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&, char, char, char)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&, ushort, ushort, ushort)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&, short, short, short)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&, int, int, int)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&, uint, uint, uint)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(|, uchar, uchar, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(|, char, char, char)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(|, ushort, ushort, ushort)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(|, short, short, short)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(|, int, int, int)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(|, uint, uint, uint)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(^, uchar, uchar, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(^, char, char, char)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(^, ushort, ushort, ushort)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(^, short, short, short)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(^, int, int, int)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(^, uint, uint, uint)
#undef CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP
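// Illustrative sketch (assumed usage, not from the original header): the
// vec-scalar operators broadcast the scalar across all channels, on either side:
//
//   __device__ void scalar_op_example(const float4& v, const uchar2& p)
//   {
//       float4 scaled  = v * 2.0f;  // float4 * float -> float4
//       int2   shifted = 10 + p;    // int + uchar2 -> int2 (widened result)
//   }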
// binary function (vec & vec)
#define CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(func_name, func, input_type, output_type) \
__device__ __forceinline__ output_type ## 1 func_name(const input_type ## 1 & a, const input_type ## 1 & b) \
{ \
return VecTraits<output_type ## 1>::make(func (a.x, b.x)); \
} \
__device__ __forceinline__ output_type ## 2 func_name(const input_type ## 2 & a, const input_type ## 2 & b) \
{ \
return VecTraits<output_type ## 2>::make(func (a.x, b.x), func (a.y, b.y)); \
} \
__device__ __forceinline__ output_type ## 3 func_name(const input_type ## 3 & a, const input_type ## 3 & b) \
{ \
return VecTraits<output_type ## 3>::make(func (a.x, b.x), func (a.y, b.y), func (a.z, b.z)); \
} \
__device__ __forceinline__ output_type ## 4 func_name(const input_type ## 4 & a, const input_type ## 4 & b) \
{ \
return VecTraits<output_type ## 4>::make(func (a.x, b.x), func (a.y, b.y), func (a.z, b.z), func (a.w, b.w)); \
}
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::max, uchar, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::max, char, char)
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::max, ushort, ushort)
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::max, short, short)
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::max, uint, uint)
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::max, int, int)
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::fmaxf, float, float)
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::fmax, double, double)
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::min, uchar, uchar)
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::min, char, char)
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::min, ushort, ushort)
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::min, short, short)
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::min, uint, uint)
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::min, int, int)
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::fminf, float, float)
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::fmin, double, double)
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, uchar, float)
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, char, float)
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, ushort, float)
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, short, float)
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, uint, float)
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, int, float)
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, float, float)
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypot, double, double)
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, uchar, float)
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, char, float)
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, ushort, float)
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, short, float)
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, uint, float)
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, int, float)
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, float, float)
CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2, double, double)
#undef CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC
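// Illustrative sketch (assumed usage): the generated functions apply the named
// device intrinsic per channel; integer max/min keep their element type, while
// hypot and atan2 promote integer inputs to float:
//
//   __device__ void vec_func_example(const short3& a, const short3& b)
//   {
//       short3 hi  = max(a, b);    // per-channel ::max
//       float3 mag = hypot(a, b);  // short3 promoted to float3 via ::hypotf
//   }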
// binary function (vec & scalar)
#define CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(func_name, func, input_type, scalar_type, output_type) \
__device__ __forceinline__ output_type ## 1 func_name(const input_type ## 1 & a, scalar_type s) \
{ \
return VecTraits<output_type ## 1>::make(func ((output_type) a.x, (output_type) s)); \
} \
__device__ __forceinline__ output_type ## 1 func_name(scalar_type s, const input_type ## 1 & b) \
{ \
return VecTraits<output_type ## 1>::make(func ((output_type) s, (output_type) b.x)); \
} \
__device__ __forceinline__ output_type ## 2 func_name(const input_type ## 2 & a, scalar_type s) \
{ \
return VecTraits<output_type ## 2>::make(func ((output_type) a.x, (output_type) s), func ((output_type) a.y, (output_type) s)); \
} \
__device__ __forceinline__ output_type ## 2 func_name(scalar_type s, const input_type ## 2 & b) \
{ \
return VecTraits<output_type ## 2>::make(func ((output_type) s, (output_type) b.x), func ((output_type) s, (output_type) b.y)); \
} \
__device__ __forceinline__ output_type ## 3 func_name(const input_type ## 3 & a, scalar_type s) \
{ \
return VecTraits<output_type ## 3>::make(func ((output_type) a.x, (output_type) s), func ((output_type) a.y, (output_type) s), func ((output_type) a.z, (output_type) s)); \
} \
__device__ __forceinline__ output_type ## 3 func_name(scalar_type s, const input_type ## 3 & b) \
{ \
return VecTraits<output_type ## 3>::make(func ((output_type) s, (output_type) b.x), func ((output_type) s, (output_type) b.y), func ((output_type) s, (output_type) b.z)); \
} \
__device__ __forceinline__ output_type ## 4 func_name(const input_type ## 4 & a, scalar_type s) \
{ \
return VecTraits<output_type ## 4>::make(func ((output_type) a.x, (output_type) s), func ((output_type) a.y, (output_type) s), func ((output_type) a.z, (output_type) s), func ((output_type) a.w, (output_type) s)); \
} \
__device__ __forceinline__ output_type ## 4 func_name(scalar_type s, const input_type ## 4 & b) \
{ \
return VecTraits<output_type ## 4>::make(func ((output_type) s, (output_type) b.x), func ((output_type) s, (output_type) b.y), func ((output_type) s, (output_type) b.z), func ((output_type) s, (output_type) b.w)); \
}
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::max, uchar, uchar, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, uchar, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, uchar, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::max, char, char, char)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, char, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, char, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::max, ushort, ushort, ushort)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, ushort, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, ushort, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::max, short, short, short)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, short, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, short, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::max, uint, uint, uint)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, uint, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, uint, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::max, int, int, int)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, int, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, int, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, float, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, float, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, double, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::min, uchar, uchar, uchar)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, uchar, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, uchar, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::min, char, char, char)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, char, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, char, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::min, ushort, ushort, ushort)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, ushort, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, ushort, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::min, short, short, short)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, short, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, short, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::min, uint, uint, uint)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, uint, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, uint, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::min, int, int, int)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, int, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, int, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, float, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, float, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, double, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, uchar, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, uchar, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, char, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, char, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, ushort, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, ushort, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, short, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, short, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, uint, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, uint, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, int, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, int, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, float, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, float, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, double, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, uchar, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, uchar, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, char, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, char, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, ushort, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, ushort, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, short, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, short, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, uint, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, uint, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, int, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, int, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, float, float, float)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, float, double, double)
CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, double, double, double)
#undef CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC
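// Illustrative sketch (assumed usage): mixing a vector with a scalar casts both
// sides to the output type first, which makes per-channel clamping concise:
//
//   __device__ int3 clamp_example(const int3& v)
//   {
//       return min(max(v, 0), 255);  // clamp each channel to [0, 255]
//   }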
}}} // namespace cv { namespace cuda { namespace device
//! @endcond
#endif // OPENCV_CUDA_VECMATH_HPP

View File

@@ -0,0 +1,288 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_CUDA_VEC_TRAITS_HPP
#define OPENCV_CUDA_VEC_TRAITS_HPP
#include "common.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
template<typename T, int N> struct TypeVec;
struct __align__(8) uchar8
{
uchar a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ uchar8 make_uchar8(uchar a0, uchar a1, uchar a2, uchar a3, uchar a4, uchar a5, uchar a6, uchar a7)
{
uchar8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
struct __align__(8) char8
{
schar a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ char8 make_char8(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7)
{
char8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
struct __align__(16) ushort8
{
ushort a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ ushort8 make_ushort8(ushort a0, ushort a1, ushort a2, ushort a3, ushort a4, ushort a5, ushort a6, ushort a7)
{
ushort8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
struct __align__(16) short8
{
short a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ short8 make_short8(short a0, short a1, short a2, short a3, short a4, short a5, short a6, short a7)
{
short8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
struct __align__(32) uint8
{
uint a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ uint8 make_uint8(uint a0, uint a1, uint a2, uint a3, uint a4, uint a5, uint a6, uint a7)
{
uint8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
struct __align__(32) int8
{
int a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ int8 make_int8(int a0, int a1, int a2, int a3, int a4, int a5, int a6, int a7)
{
int8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
struct __align__(32) float8
{
float a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ float8 make_float8(float a0, float a1, float a2, float a3, float a4, float a5, float a6, float a7)
{
float8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
struct double8
{
double a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ double8 make_double8(double a0, double a1, double a2, double a3, double a4, double a5, double a6, double a7)
{
double8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
#define OPENCV_CUDA_IMPLEMENT_TYPE_VEC(type) \
template<> struct TypeVec<type, 1> { typedef type vec_type; }; \
template<> struct TypeVec<type ## 1, 1> { typedef type ## 1 vec_type; }; \
template<> struct TypeVec<type, 2> { typedef type ## 2 vec_type; }; \
template<> struct TypeVec<type ## 2, 2> { typedef type ## 2 vec_type; }; \
template<> struct TypeVec<type, 3> { typedef type ## 3 vec_type; }; \
template<> struct TypeVec<type ## 3, 3> { typedef type ## 3 vec_type; }; \
template<> struct TypeVec<type, 4> { typedef type ## 4 vec_type; }; \
template<> struct TypeVec<type ## 4, 4> { typedef type ## 4 vec_type; }; \
template<> struct TypeVec<type, 8> { typedef type ## 8 vec_type; }; \
template<> struct TypeVec<type ## 8, 8> { typedef type ## 8 vec_type; };
OPENCV_CUDA_IMPLEMENT_TYPE_VEC(uchar)
OPENCV_CUDA_IMPLEMENT_TYPE_VEC(char)
OPENCV_CUDA_IMPLEMENT_TYPE_VEC(ushort)
OPENCV_CUDA_IMPLEMENT_TYPE_VEC(short)
OPENCV_CUDA_IMPLEMENT_TYPE_VEC(int)
OPENCV_CUDA_IMPLEMENT_TYPE_VEC(uint)
OPENCV_CUDA_IMPLEMENT_TYPE_VEC(float)
OPENCV_CUDA_IMPLEMENT_TYPE_VEC(double)
#undef OPENCV_CUDA_IMPLEMENT_TYPE_VEC
template<> struct TypeVec<schar, 1> { typedef schar vec_type; };
template<> struct TypeVec<schar, 2> { typedef char2 vec_type; };
template<> struct TypeVec<schar, 3> { typedef char3 vec_type; };
template<> struct TypeVec<schar, 4> { typedef char4 vec_type; };
template<> struct TypeVec<schar, 8> { typedef char8 vec_type; };
template<> struct TypeVec<bool, 1> { typedef uchar vec_type; };
template<> struct TypeVec<bool, 2> { typedef uchar2 vec_type; };
template<> struct TypeVec<bool, 3> { typedef uchar3 vec_type; };
template<> struct TypeVec<bool, 4> { typedef uchar4 vec_type; };
template<> struct TypeVec<bool, 8> { typedef uchar8 vec_type; };
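// Illustrative sketch (assumed usage): TypeVec maps an element type and a
// channel count to the corresponding CUDA vector type, so templated kernels
// can name their storage generically (note bool maps to the uchar vectors):
//
//   template <typename T, int cn>
//   __device__ void type_vec_example()
//   {
//       typedef typename TypeVec<T, cn>::vec_type vec;  // e.g. TypeVec<uchar, 3> -> uchar3
//       vec v;                                          //      TypeVec<bool, 4>  -> uchar4
//       (void)v;
//   }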
template<typename T> struct VecTraits;
#define OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(type) \
template<> struct VecTraits<type> \
{ \
typedef type elem_type; \
enum {cn=1}; \
static __device__ __host__ __forceinline__ type all(type v) {return v;} \
static __device__ __host__ __forceinline__ type make(type x) {return x;} \
static __device__ __host__ __forceinline__ type make(const type* v) {return *v;} \
}; \
template<> struct VecTraits<type ## 1> \
{ \
typedef type elem_type; \
enum {cn=1}; \
static __device__ __host__ __forceinline__ type ## 1 all(type v) {return make_ ## type ## 1(v);} \
static __device__ __host__ __forceinline__ type ## 1 make(type x) {return make_ ## type ## 1(x);} \
static __device__ __host__ __forceinline__ type ## 1 make(const type* v) {return make_ ## type ## 1(*v);} \
}; \
template<> struct VecTraits<type ## 2> \
{ \
typedef type elem_type; \
enum {cn=2}; \
static __device__ __host__ __forceinline__ type ## 2 all(type v) {return make_ ## type ## 2(v, v);} \
static __device__ __host__ __forceinline__ type ## 2 make(type x, type y) {return make_ ## type ## 2(x, y);} \
static __device__ __host__ __forceinline__ type ## 2 make(const type* v) {return make_ ## type ## 2(v[0], v[1]);} \
}; \
template<> struct VecTraits<type ## 3> \
{ \
typedef type elem_type; \
enum {cn=3}; \
static __device__ __host__ __forceinline__ type ## 3 all(type v) {return make_ ## type ## 3(v, v, v);} \
static __device__ __host__ __forceinline__ type ## 3 make(type x, type y, type z) {return make_ ## type ## 3(x, y, z);} \
static __device__ __host__ __forceinline__ type ## 3 make(const type* v) {return make_ ## type ## 3(v[0], v[1], v[2]);} \
}; \
template<> struct VecTraits<type ## 4> \
{ \
typedef type elem_type; \
enum {cn=4}; \
static __device__ __host__ __forceinline__ type ## 4 all(type v) {return make_ ## type ## 4(v, v, v, v);} \
static __device__ __host__ __forceinline__ type ## 4 make(type x, type y, type z, type w) {return make_ ## type ## 4(x, y, z, w);} \
static __device__ __host__ __forceinline__ type ## 4 make(const type* v) {return make_ ## type ## 4(v[0], v[1], v[2], v[3]);} \
}; \
template<> struct VecTraits<type ## 8> \
{ \
typedef type elem_type; \
enum {cn=8}; \
static __device__ __host__ __forceinline__ type ## 8 all(type v) {return make_ ## type ## 8(v, v, v, v, v, v, v, v);} \
static __device__ __host__ __forceinline__ type ## 8 make(type a0, type a1, type a2, type a3, type a4, type a5, type a6, type a7) {return make_ ## type ## 8(a0, a1, a2, a3, a4, a5, a6, a7);} \
static __device__ __host__ __forceinline__ type ## 8 make(const type* v) {return make_ ## type ## 8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);} \
};
OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(uchar)
OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(ushort)
OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(short)
OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(int)
OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(uint)
OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(float)
OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(double)
#undef OPENCV_CUDA_IMPLEMENT_VEC_TRAITS
template<> struct VecTraits<char>
{
typedef char elem_type;
enum {cn=1};
static __device__ __host__ __forceinline__ char all(char v) {return v;}
static __device__ __host__ __forceinline__ char make(char x) {return x;}
static __device__ __host__ __forceinline__ char make(const char* x) {return *x;}
};
template<> struct VecTraits<schar>
{
typedef schar elem_type;
enum {cn=1};
static __device__ __host__ __forceinline__ schar all(schar v) {return v;}
static __device__ __host__ __forceinline__ schar make(schar x) {return x;}
static __device__ __host__ __forceinline__ schar make(const schar* x) {return *x;}
};
template<> struct VecTraits<char1>
{
typedef schar elem_type;
enum {cn=1};
static __device__ __host__ __forceinline__ char1 all(schar v) {return make_char1(v);}
static __device__ __host__ __forceinline__ char1 make(schar x) {return make_char1(x);}
static __device__ __host__ __forceinline__ char1 make(const schar* v) {return make_char1(v[0]);}
};
template<> struct VecTraits<char2>
{
typedef schar elem_type;
enum {cn=2};
static __device__ __host__ __forceinline__ char2 all(schar v) {return make_char2(v, v);}
static __device__ __host__ __forceinline__ char2 make(schar x, schar y) {return make_char2(x, y);}
static __device__ __host__ __forceinline__ char2 make(const schar* v) {return make_char2(v[0], v[1]);}
};
template<> struct VecTraits<char3>
{
typedef schar elem_type;
enum {cn=3};
static __device__ __host__ __forceinline__ char3 all(schar v) {return make_char3(v, v, v);}
static __device__ __host__ __forceinline__ char3 make(schar x, schar y, schar z) {return make_char3(x, y, z);}
static __device__ __host__ __forceinline__ char3 make(const schar* v) {return make_char3(v[0], v[1], v[2]);}
};
template<> struct VecTraits<char4>
{
typedef schar elem_type;
enum {cn=4};
static __device__ __host__ __forceinline__ char4 all(schar v) {return make_char4(v, v, v, v);}
static __device__ __host__ __forceinline__ char4 make(schar x, schar y, schar z, schar w) {return make_char4(x, y, z, w);}
static __device__ __host__ __forceinline__ char4 make(const schar* v) {return make_char4(v[0], v[1], v[2], v[3]);}
};
template<> struct VecTraits<char8>
{
typedef schar elem_type;
enum {cn=8};
static __device__ __host__ __forceinline__ char8 all(schar v) {return make_char8(v, v, v, v, v, v, v, v);}
static __device__ __host__ __forceinline__ char8 make(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7) {return make_char8(a0, a1, a2, a3, a4, a5, a6, a7);}
static __device__ __host__ __forceinline__ char8 make(const schar* v) {return make_char8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);}
};
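// Illustrative sketch (assumed usage): VecTraits gives templated code a uniform
// way to build and inspect vector values regardless of channel count:
//
//   __device__ void vec_traits_example()
//   {
//       float3 ones = VecTraits<float3>::all(1.0f);            // (1, 1, 1)
//       float3 xyz  = VecTraits<float3>::make(1.f, 2.f, 3.f);
//       const int channels = VecTraits<float3>::cn;            // 3
//       (void)ones; (void)xyz; (void)channels;
//   }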
}}} // namespace cv { namespace cuda { namespace device
//! @endcond
#endif // OPENCV_CUDA_VEC_TRAITS_HPP

View File

@@ -0,0 +1,139 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_CUDA_DEVICE_WARP_HPP
#define OPENCV_CUDA_DEVICE_WARP_HPP
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
struct Warp
{
enum
{
LOG_WARP_SIZE = 5,
WARP_SIZE = 1 << LOG_WARP_SIZE,
STRIDE = WARP_SIZE
};
/** \brief Returns the warp lane ID of the calling thread. */
static __device__ __forceinline__ unsigned int laneId()
{
unsigned int ret;
asm("mov.u32 %0, %%laneid;" : "=r"(ret) );
return ret;
}
template<typename It, typename T>
static __device__ __forceinline__ void fill(It beg, It end, const T& value)
{
for(It t = beg + laneId(); t < end; t += STRIDE)
*t = value;
}
template<typename InIt, typename OutIt>
static __device__ __forceinline__ OutIt copy(InIt beg, InIt end, OutIt out)
{
for(InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE)
*out = *t;
return out;
}
template<typename InIt, typename OutIt, class UnOp>
static __device__ __forceinline__ OutIt transform(InIt beg, InIt end, OutIt out, UnOp op)
{
for(InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE)
*out = op(*t);
return out;
}
template<typename InIt1, typename InIt2, typename OutIt, class BinOp>
static __device__ __forceinline__ OutIt transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)
{
unsigned int lane = laneId();
InIt1 t1 = beg1 + lane;
InIt2 t2 = beg2 + lane;
for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, out += STRIDE)
*out = op(*t1, *t2);
return out;
}
template <class T, class BinOp>
static __device__ __forceinline__ T reduce(volatile T *ptr, BinOp op)
{
const unsigned int lane = laneId();
if (lane < 16)
{
T partial = ptr[lane];
ptr[lane] = partial = op(partial, ptr[lane + 16]);
ptr[lane] = partial = op(partial, ptr[lane + 8]);
ptr[lane] = partial = op(partial, ptr[lane + 4]);
ptr[lane] = partial = op(partial, ptr[lane + 2]);
ptr[lane] = partial = op(partial, ptr[lane + 1]);
}
return *ptr;
}
template<typename OutIt, typename T>
static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
{
unsigned int lane = laneId();
value += lane;
for(OutIt t = beg + lane; t < end; t += STRIDE, value += STRIDE)
*t = value;
}
};
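// Illustrative sketch (assumed usage, pre-Volta warp-synchronous style): each
// lane of the warp cooperates on a 32-element shared-memory segment; plus() is
// an assumed binary functor such as a device-side std::plus equivalent:
//
//   __shared__ volatile float smem[32];
//   smem[Warp::laneId()] = partial;             // one value per lane
//   float total = Warp::reduce(smem, plus());   // smem[0] ends up holding the warp sum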
}}} // namespace cv { namespace cuda { namespace device
//! @endcond
#endif /* OPENCV_CUDA_DEVICE_WARP_HPP */

View File

@@ -40,21 +40,37 @@
//
//M*/
#ifndef OPENCV_OLD_CV_HPP
#define OPENCV_OLD_CV_HPP
#ifndef OPENCV_CUDA_WARP_REDUCE_HPP__
#define OPENCV_CUDA_WARP_REDUCE_HPP__
//#if defined(__GNUC__)
//#warning "This is a deprecated opencv header provided for compatibility. Please include a header from a corresponding opencv module"
//#endif
/** @file
* @deprecated Use @ref cudev instead.
*/
#include "cv.h"
#include "opencv2/core.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/photo.hpp"
#include "opencv2/video.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/features2d.hpp"
#include "opencv2/calib3d.hpp"
#include "opencv2/objdetect.hpp"
//! @cond IGNORED
#endif
namespace cv { namespace cuda { namespace device
{
template <class T>
__device__ __forceinline__ T warp_reduce(volatile T *ptr , const unsigned int tid = threadIdx.x)
{
const unsigned int lane = tid & 31; // index of thread in warp (0..31)
if (lane < 16)
{
T partial = ptr[tid];
ptr[tid] = partial = partial + ptr[tid + 16];
ptr[tid] = partial = partial + ptr[tid + 8];
ptr[tid] = partial = partial + ptr[tid + 4];
ptr[tid] = partial = partial + ptr[tid + 2];
ptr[tid] = partial = partial + ptr[tid + 1];
}
return ptr[tid - lane];
}
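// Illustrative sketch (assumed usage): warp_reduce performs a warp-synchronous
// tree sum over shared memory; it is only valid where implicit warp
// synchronization can be relied upon (pre-Volta style code):
//
//   __shared__ volatile float smem[256];
//   smem[threadIdx.x] = val;
//   float warp_sum = warp_reduce(smem);  // every lane reads back its warp's total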
}}} // namespace cv { namespace cuda { namespace device
//! @endcond
#endif /* OPENCV_CUDA_WARP_REDUCE_HPP__ */

View File

@@ -0,0 +1,162 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_CUDA_WARP_SHUFFLE_HPP
#define OPENCV_CUDA_WARP_SHUFFLE_HPP
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
#if __CUDACC_VER_MAJOR__ >= 9
# define __shfl(x, y, z) __shfl_sync(0xFFFFFFFFU, x, y, z)
# define __shfl_up(x, y, z) __shfl_up_sync(0xFFFFFFFFU, x, y, z)
# define __shfl_down(x, y, z) __shfl_down_sync(0xFFFFFFFFU, x, y, z)
#endif
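// Note (added for clarity): CUDA 9 deprecated the unmasked warp shuffles, so the
// macros above remap the legacy names to the *_sync variants with a full
// 0xFFFFFFFF mask, i.e. they assume all 32 lanes of the warp are active.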
template <typename T>
__device__ __forceinline__ T shfl(T val, int srcLane, int width = warpSize)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
return __shfl(val, srcLane, width);
#else
return T();
#endif
}
__device__ __forceinline__ unsigned int shfl(unsigned int val, int srcLane, int width = warpSize)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
return (unsigned int) __shfl((int) val, srcLane, width);
#else
return 0;
#endif
}
__device__ __forceinline__ double shfl(double val, int srcLane, int width = warpSize)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
int lo = __double2loint(val);
int hi = __double2hiint(val);
lo = __shfl(lo, srcLane, width);
hi = __shfl(hi, srcLane, width);
return __hiloint2double(hi, lo);
#else
return 0.0;
#endif
}
template <typename T>
__device__ __forceinline__ T shfl_down(T val, unsigned int delta, int width = warpSize)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
return __shfl_down(val, delta, width);
#else
return T();
#endif
}
__device__ __forceinline__ unsigned int shfl_down(unsigned int val, unsigned int delta, int width = warpSize)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
return (unsigned int) __shfl_down((int) val, delta, width);
#else
return 0;
#endif
}
__device__ __forceinline__ double shfl_down(double val, unsigned int delta, int width = warpSize)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
int lo = __double2loint(val);
int hi = __double2hiint(val);
lo = __shfl_down(lo, delta, width);
hi = __shfl_down(hi, delta, width);
return __hiloint2double(hi, lo);
#else
return 0.0;
#endif
}
template <typename T>
__device__ __forceinline__ T shfl_up(T val, unsigned int delta, int width = warpSize)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
return __shfl_up(val, delta, width);
#else
return T();
#endif
}
__device__ __forceinline__ unsigned int shfl_up(unsigned int val, unsigned int delta, int width = warpSize)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
return (unsigned int) __shfl_up((int) val, delta, width);
#else
return 0;
#endif
}
__device__ __forceinline__ double shfl_up(double val, unsigned int delta, int width = warpSize)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
int lo = __double2loint(val);
int hi = __double2hiint(val);
lo = __shfl_up(lo, delta, width);
hi = __shfl_up(hi, delta, width);
return __hiloint2double(hi, lo);
#else
return 0.0;
#endif
}
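// Illustrative sketch (assumed usage): a butterfly warp sum built from
// shfl_down; after the loop, lane 0 holds the total (all lanes active):
//
//   float val = /* per-lane partial */;
//   for (int offset = warpSize / 2; offset > 0; offset /= 2)
//       val += shfl_down(val, offset);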
}}} // namespace cv { namespace cuda { namespace device
# undef __shfl
# undef __shfl_up
# undef __shfl_down
//! @endcond
#endif // OPENCV_CUDA_WARP_SHUFFLE_HPP

View File

@@ -47,6 +47,13 @@
# error cuda_types.hpp header must be compiled as C++
#endif
#if defined(__OPENCV_BUILD) && defined(__clang__)
#pragma clang diagnostic ignored "-Winconsistent-missing-override"
#endif
#if defined(__OPENCV_BUILD) && defined(__GNUC__) && __GNUC__ >= 5
#pragma GCC diagnostic ignored "-Wsuggest-override"
#endif
/** @file
* @deprecated Use @ref cudev instead.
*/
@@ -120,10 +127,12 @@ namespace cv
};
typedef PtrStepSz<unsigned char> PtrStepSzb;
typedef PtrStepSz<unsigned short> PtrStepSzus;
typedef PtrStepSz<float> PtrStepSzf;
typedef PtrStepSz<int> PtrStepSzi;
typedef PtrStep<unsigned char> PtrStepb;
typedef PtrStep<unsigned short> PtrStepus;
typedef PtrStep<float> PtrStepf;
typedef PtrStep<int> PtrStepi;

View File

@@ -15,6 +15,7 @@
#define CV_CPU_OPTIMIZATION_NAMESPACE cpu_baseline
#define CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN namespace cpu_baseline {
#define CV_CPU_OPTIMIZATION_NAMESPACE_END }
#define CV_CPU_BASELINE_MODE 1
#endif
@@ -82,6 +83,14 @@
# include <immintrin.h>
# define CV_AVX2 1
#endif
#ifdef CV_CPU_COMPILE_AVX_512F
# include <immintrin.h>
# define CV_AVX_512F 1
#endif
#ifdef CV_CPU_COMPILE_AVX512_SKX
# include <immintrin.h>
# define CV_AVX512_SKX 1
#endif
#ifdef CV_CPU_COMPILE_FMA3
# define CV_FMA3 1
#endif
@@ -99,7 +108,7 @@
# include <arm_neon.h>
#endif
#if defined(__VSX__) && defined(__PPC64__) && defined(__LITTLE_ENDIAN__)
#ifdef CV_CPU_COMPILE_VSX
# include <altivec.h>
# undef vector
# undef pixel
@@ -107,16 +116,24 @@
# define CV_VSX 1
#endif
#ifdef CV_CPU_COMPILE_VSX3
# define CV_VSX3 1
#endif
#endif // CV_ENABLE_INTRINSICS && !CV_DISABLE_OPTIMIZATION && !__CUDACC__
#if defined CV_CPU_COMPILE_AVX && !defined CV_CPU_BASELINE_COMPILE_AVX
struct VZeroUpperGuard {
#ifdef __GNUC__
__attribute__((always_inline))
#endif
inline VZeroUpperGuard() { _mm256_zeroupper(); }
#ifdef __GNUC__
__attribute__((always_inline))
#endif
inline ~VZeroUpperGuard() { _mm256_zeroupper(); }
};
#define __CV_AVX_GUARD VZeroUpperGuard __vzeroupper_guard; (void)__vzeroupper_guard;
#define __CV_AVX_GUARD VZeroUpperGuard __vzeroupper_guard; CV_UNUSED(__vzeroupper_guard);
#endif
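// Note (added for clarity): VZeroUpperGuard emits vzeroupper on scope entry and
// exit, clearing the upper halves of the YMM registers so that mixing AVX code
// with legacy SSE code does not incur the AVX<->SSE transition penalty.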
#ifdef __CV_AVX_GUARD
@@ -218,6 +235,9 @@ struct VZeroUpperGuard {
#ifndef CV_AVX_512VL
# define CV_AVX_512VL 0
#endif
#ifndef CV_AVX512_SKX
# define CV_AVX512_SKX 0
#endif
#ifndef CV_NEON
# define CV_NEON 0
@@ -226,3 +246,7 @@ struct VZeroUpperGuard {
#ifndef CV_VSX
# define CV_VSX 0
#endif
#ifndef CV_VSX3
# define CV_VSX3 0
#endif

View File

@@ -2,198 +2,339 @@
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE
# define CV_TRY_SSE 1
# define CV_CPU_FORCE_SSE 1
# define CV_CPU_HAS_SUPPORT_SSE 1
# define CV_CPU_CALL_SSE(fn, args) return (opt_SSE::fn args)
# define CV_CPU_CALL_SSE(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_SSE_(fn, args) return (opt_SSE::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE
# define CV_TRY_SSE 1
# define CV_CPU_FORCE_SSE 0
# define CV_CPU_HAS_SUPPORT_SSE (cv::checkHardwareSupport(CV_CPU_SSE))
# define CV_CPU_CALL_SSE(fn, args) if (CV_CPU_HAS_SUPPORT_SSE) return (opt_SSE::fn args)
# define CV_CPU_CALL_SSE_(fn, args) if (CV_CPU_HAS_SUPPORT_SSE) return (opt_SSE::fn args)
#else
# define CV_TRY_SSE 0
# define CV_CPU_FORCE_SSE 0
# define CV_CPU_HAS_SUPPORT_SSE 0
# define CV_CPU_CALL_SSE(fn, args)
# define CV_CPU_CALL_SSE_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_SSE(fn, args, mode, ...) CV_CPU_CALL_SSE(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
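// Illustrative sketch (assumed usage; the function name is hypothetical): a
// dispatching wrapper tries each enabled ISA in turn. CV_CPU_CALL_SSE either
// returns the optimized implementation or expands to nothing and falls through
// to the baseline:
//
//   void process(const uchar* src, uchar* dst, int n)
//   {
//       CV_CPU_CALL_SSE(process, (src, dst, n));  // returns the SSE build when available
//       cpu_baseline::process(src, dst, n);       // generic fallback
//   }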
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE2
# define CV_TRY_SSE2 1
# define CV_CPU_FORCE_SSE2 1
# define CV_CPU_HAS_SUPPORT_SSE2 1
# define CV_CPU_CALL_SSE2(fn, args) return (opt_SSE2::fn args)
# define CV_CPU_CALL_SSE2(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_SSE2_(fn, args) return (opt_SSE2::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE2
# define CV_TRY_SSE2 1
# define CV_CPU_FORCE_SSE2 0
# define CV_CPU_HAS_SUPPORT_SSE2 (cv::checkHardwareSupport(CV_CPU_SSE2))
# define CV_CPU_CALL_SSE2(fn, args) if (CV_CPU_HAS_SUPPORT_SSE2) return (opt_SSE2::fn args)
# define CV_CPU_CALL_SSE2_(fn, args) if (CV_CPU_HAS_SUPPORT_SSE2) return (opt_SSE2::fn args)
#else
# define CV_TRY_SSE2 0
# define CV_CPU_FORCE_SSE2 0
# define CV_CPU_HAS_SUPPORT_SSE2 0
# define CV_CPU_CALL_SSE2(fn, args)
# define CV_CPU_CALL_SSE2_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_SSE2(fn, args, mode, ...) CV_CPU_CALL_SSE2(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE3
# define CV_TRY_SSE3 1
# define CV_CPU_FORCE_SSE3 1
# define CV_CPU_HAS_SUPPORT_SSE3 1
# define CV_CPU_CALL_SSE3(fn, args) return (opt_SSE3::fn args)
# define CV_CPU_CALL_SSE3(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_SSE3_(fn, args) return (opt_SSE3::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE3
# define CV_TRY_SSE3 1
# define CV_CPU_FORCE_SSE3 0
# define CV_CPU_HAS_SUPPORT_SSE3 (cv::checkHardwareSupport(CV_CPU_SSE3))
# define CV_CPU_CALL_SSE3(fn, args) if (CV_CPU_HAS_SUPPORT_SSE3) return (opt_SSE3::fn args)
# define CV_CPU_CALL_SSE3_(fn, args) if (CV_CPU_HAS_SUPPORT_SSE3) return (opt_SSE3::fn args)
#else
# define CV_TRY_SSE3 0
# define CV_CPU_FORCE_SSE3 0
# define CV_CPU_HAS_SUPPORT_SSE3 0
# define CV_CPU_CALL_SSE3(fn, args)
# define CV_CPU_CALL_SSE3_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_SSE3(fn, args, mode, ...) CV_CPU_CALL_SSE3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSSE3
# define CV_TRY_SSSE3 1
# define CV_CPU_FORCE_SSSE3 1
# define CV_CPU_HAS_SUPPORT_SSSE3 1
# define CV_CPU_CALL_SSSE3(fn, args) return (opt_SSSE3::fn args)
# define CV_CPU_CALL_SSSE3(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_SSSE3_(fn, args) return (opt_SSSE3::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSSE3
# define CV_TRY_SSSE3 1
# define CV_CPU_FORCE_SSSE3 0
# define CV_CPU_HAS_SUPPORT_SSSE3 (cv::checkHardwareSupport(CV_CPU_SSSE3))
# define CV_CPU_CALL_SSSE3(fn, args) if (CV_CPU_HAS_SUPPORT_SSSE3) return (opt_SSSE3::fn args)
# define CV_CPU_CALL_SSSE3_(fn, args) if (CV_CPU_HAS_SUPPORT_SSSE3) return (opt_SSSE3::fn args)
#else
# define CV_TRY_SSSE3 0
# define CV_CPU_FORCE_SSSE3 0
# define CV_CPU_HAS_SUPPORT_SSSE3 0
# define CV_CPU_CALL_SSSE3(fn, args)
# define CV_CPU_CALL_SSSE3_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_SSSE3(fn, args, mode, ...) CV_CPU_CALL_SSSE3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE4_1
# define CV_TRY_SSE4_1 1
# define CV_CPU_FORCE_SSE4_1 1
# define CV_CPU_HAS_SUPPORT_SSE4_1 1
# define CV_CPU_CALL_SSE4_1(fn, args) return (opt_SSE4_1::fn args)
# define CV_CPU_CALL_SSE4_1(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_SSE4_1_(fn, args) return (opt_SSE4_1::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE4_1
# define CV_TRY_SSE4_1 1
# define CV_CPU_FORCE_SSE4_1 0
# define CV_CPU_HAS_SUPPORT_SSE4_1 (cv::checkHardwareSupport(CV_CPU_SSE4_1))
# define CV_CPU_CALL_SSE4_1(fn, args) if (CV_CPU_HAS_SUPPORT_SSE4_1) return (opt_SSE4_1::fn args)
# define CV_CPU_CALL_SSE4_1_(fn, args) if (CV_CPU_HAS_SUPPORT_SSE4_1) return (opt_SSE4_1::fn args)
#else
# define CV_TRY_SSE4_1 0
# define CV_CPU_FORCE_SSE4_1 0
# define CV_CPU_HAS_SUPPORT_SSE4_1 0
# define CV_CPU_CALL_SSE4_1(fn, args)
# define CV_CPU_CALL_SSE4_1_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_SSE4_1(fn, args, mode, ...) CV_CPU_CALL_SSE4_1(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE4_2
# define CV_TRY_SSE4_2 1
# define CV_CPU_FORCE_SSE4_2 1
# define CV_CPU_HAS_SUPPORT_SSE4_2 1
# define CV_CPU_CALL_SSE4_2(fn, args) return (opt_SSE4_2::fn args)
# define CV_CPU_CALL_SSE4_2(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_SSE4_2_(fn, args) return (opt_SSE4_2::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE4_2
# define CV_TRY_SSE4_2 1
# define CV_CPU_FORCE_SSE4_2 0
# define CV_CPU_HAS_SUPPORT_SSE4_2 (cv::checkHardwareSupport(CV_CPU_SSE4_2))
# define CV_CPU_CALL_SSE4_2(fn, args) if (CV_CPU_HAS_SUPPORT_SSE4_2) return (opt_SSE4_2::fn args)
# define CV_CPU_CALL_SSE4_2_(fn, args) if (CV_CPU_HAS_SUPPORT_SSE4_2) return (opt_SSE4_2::fn args)
#else
# define CV_TRY_SSE4_2 0
# define CV_CPU_FORCE_SSE4_2 0
# define CV_CPU_HAS_SUPPORT_SSE4_2 0
# define CV_CPU_CALL_SSE4_2(fn, args)
# define CV_CPU_CALL_SSE4_2_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_SSE4_2(fn, args, mode, ...) CV_CPU_CALL_SSE4_2(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_POPCNT
# define CV_TRY_POPCNT 1
# define CV_CPU_FORCE_POPCNT 1
# define CV_CPU_HAS_SUPPORT_POPCNT 1
# define CV_CPU_CALL_POPCNT(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_POPCNT_(fn, args) return (opt_POPCNT::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_POPCNT
# define CV_TRY_POPCNT 1
# define CV_CPU_FORCE_POPCNT 0
# define CV_CPU_HAS_SUPPORT_POPCNT (cv::checkHardwareSupport(CV_CPU_POPCNT))
# define CV_CPU_CALL_POPCNT(fn, args) if (CV_CPU_HAS_SUPPORT_POPCNT) return (opt_POPCNT::fn args)
# define CV_CPU_CALL_POPCNT_(fn, args) if (CV_CPU_HAS_SUPPORT_POPCNT) return (opt_POPCNT::fn args)
#else
# define CV_TRY_POPCNT 0
# define CV_CPU_FORCE_POPCNT 0
# define CV_CPU_HAS_SUPPORT_POPCNT 0
# define CV_CPU_CALL_POPCNT(fn, args)
# define CV_CPU_CALL_POPCNT_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_POPCNT(fn, args, mode, ...) CV_CPU_CALL_POPCNT(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX
# define CV_TRY_AVX 1
# define CV_CPU_FORCE_AVX 1
# define CV_CPU_HAS_SUPPORT_AVX 1
# define CV_CPU_CALL_AVX(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_AVX_(fn, args) return (opt_AVX::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX
# define CV_TRY_AVX 1
# define CV_CPU_FORCE_AVX 0
# define CV_CPU_HAS_SUPPORT_AVX (cv::checkHardwareSupport(CV_CPU_AVX))
# define CV_CPU_CALL_AVX(fn, args) if (CV_CPU_HAS_SUPPORT_AVX) return (opt_AVX::fn args)
# define CV_CPU_CALL_AVX_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX) return (opt_AVX::fn args)
#else
# define CV_TRY_AVX 0
# define CV_CPU_FORCE_AVX 0
# define CV_CPU_HAS_SUPPORT_AVX 0
# define CV_CPU_CALL_AVX(fn, args)
# define CV_CPU_CALL_AVX_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_AVX(fn, args, mode, ...) CV_CPU_CALL_AVX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_FP16
# define CV_TRY_FP16 1
# define CV_CPU_FORCE_FP16 1
# define CV_CPU_HAS_SUPPORT_FP16 1
# define CV_CPU_CALL_FP16(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_FP16_(fn, args) return (opt_FP16::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_FP16
# define CV_TRY_FP16 1
# define CV_CPU_FORCE_FP16 0
# define CV_CPU_HAS_SUPPORT_FP16 (cv::checkHardwareSupport(CV_CPU_FP16))
# define CV_CPU_CALL_FP16(fn, args) if (CV_CPU_HAS_SUPPORT_FP16) return (opt_FP16::fn args)
# define CV_CPU_CALL_FP16_(fn, args) if (CV_CPU_HAS_SUPPORT_FP16) return (opt_FP16::fn args)
#else
# define CV_TRY_FP16 0
# define CV_CPU_FORCE_FP16 0
# define CV_CPU_HAS_SUPPORT_FP16 0
# define CV_CPU_CALL_FP16(fn, args)
# define CV_CPU_CALL_FP16_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_FP16(fn, args, mode, ...) CV_CPU_CALL_FP16(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX2
# define CV_TRY_AVX2 1
# define CV_CPU_FORCE_AVX2 1
# define CV_CPU_HAS_SUPPORT_AVX2 1
# define CV_CPU_CALL_AVX2(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_AVX2_(fn, args) return (opt_AVX2::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX2
# define CV_TRY_AVX2 1
# define CV_CPU_FORCE_AVX2 0
# define CV_CPU_HAS_SUPPORT_AVX2 (cv::checkHardwareSupport(CV_CPU_AVX2))
# define CV_CPU_CALL_AVX2(fn, args) if (CV_CPU_HAS_SUPPORT_AVX2) return (opt_AVX2::fn args)
# define CV_CPU_CALL_AVX2_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX2) return (opt_AVX2::fn args)
#else
# define CV_TRY_AVX2 0
# define CV_CPU_FORCE_AVX2 0
# define CV_CPU_HAS_SUPPORT_AVX2 0
# define CV_CPU_CALL_AVX2(fn, args)
# define CV_CPU_CALL_AVX2_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_AVX2(fn, args, mode, ...) CV_CPU_CALL_AVX2(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_FMA3
# define CV_TRY_FMA3 1
# define CV_CPU_FORCE_FMA3 1
# define CV_CPU_HAS_SUPPORT_FMA3 1
# define CV_CPU_CALL_FMA3(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_FMA3_(fn, args) return (opt_FMA3::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_FMA3
# define CV_TRY_FMA3 1
# define CV_CPU_FORCE_FMA3 0
# define CV_CPU_HAS_SUPPORT_FMA3 (cv::checkHardwareSupport(CV_CPU_FMA3))
# define CV_CPU_CALL_FMA3(fn, args) if (CV_CPU_HAS_SUPPORT_FMA3) return (opt_FMA3::fn args)
# define CV_CPU_CALL_FMA3_(fn, args) if (CV_CPU_HAS_SUPPORT_FMA3) return (opt_FMA3::fn args)
#else
# define CV_TRY_FMA3 0
# define CV_CPU_FORCE_FMA3 0
# define CV_CPU_HAS_SUPPORT_FMA3 0
# define CV_CPU_CALL_FMA3(fn, args)
# define CV_CPU_CALL_FMA3_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_FMA3(fn, args, mode, ...) CV_CPU_CALL_FMA3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX_512F
# define CV_TRY_AVX_512F 1
# define CV_CPU_FORCE_AVX_512F 1
# define CV_CPU_HAS_SUPPORT_AVX_512F 1
# define CV_CPU_CALL_AVX_512F(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_AVX_512F_(fn, args) return (opt_AVX_512F::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX_512F
# define CV_TRY_AVX_512F 1
# define CV_CPU_FORCE_AVX_512F 0
# define CV_CPU_HAS_SUPPORT_AVX_512F (cv::checkHardwareSupport(CV_CPU_AVX_512F))
# define CV_CPU_CALL_AVX_512F(fn, args) if (CV_CPU_HAS_SUPPORT_AVX_512F) return (opt_AVX_512F::fn args)
# define CV_CPU_CALL_AVX_512F_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX_512F) return (opt_AVX_512F::fn args)
#else
# define CV_TRY_AVX_512F 0
# define CV_CPU_FORCE_AVX_512F 0
# define CV_CPU_HAS_SUPPORT_AVX_512F 0
# define CV_CPU_CALL_AVX_512F(fn, args)
# define CV_CPU_CALL_AVX_512F_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_AVX_512F(fn, args, mode, ...) CV_CPU_CALL_AVX_512F(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512_SKX
# define CV_TRY_AVX512_SKX 1
# define CV_CPU_FORCE_AVX512_SKX 1
# define CV_CPU_HAS_SUPPORT_AVX512_SKX 1
# define CV_CPU_CALL_AVX512_SKX(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_AVX512_SKX_(fn, args) return (opt_AVX512_SKX::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX512_SKX
# define CV_TRY_AVX512_SKX 1
# define CV_CPU_FORCE_AVX512_SKX 0
# define CV_CPU_HAS_SUPPORT_AVX512_SKX (cv::checkHardwareSupport(CV_CPU_AVX512_SKX))
# define CV_CPU_CALL_AVX512_SKX(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_SKX) return (opt_AVX512_SKX::fn args)
# define CV_CPU_CALL_AVX512_SKX_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_SKX) return (opt_AVX512_SKX::fn args)
#else
# define CV_TRY_AVX512_SKX 0
# define CV_CPU_FORCE_AVX512_SKX 0
# define CV_CPU_HAS_SUPPORT_AVX512_SKX 0
# define CV_CPU_CALL_AVX512_SKX(fn, args)
# define CV_CPU_CALL_AVX512_SKX_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_AVX512_SKX(fn, args, mode, ...) CV_CPU_CALL_AVX512_SKX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON
# define CV_TRY_NEON 1
# define CV_CPU_FORCE_NEON 1
# define CV_CPU_HAS_SUPPORT_NEON 1
# define CV_CPU_CALL_NEON(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_NEON_(fn, args) return (opt_NEON::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_NEON
# define CV_TRY_NEON 1
# define CV_CPU_FORCE_NEON 0
# define CV_CPU_HAS_SUPPORT_NEON (cv::checkHardwareSupport(CV_CPU_NEON))
# define CV_CPU_CALL_NEON(fn, args) if (CV_CPU_HAS_SUPPORT_NEON) return (opt_NEON::fn args)
# define CV_CPU_CALL_NEON_(fn, args) if (CV_CPU_HAS_SUPPORT_NEON) return (opt_NEON::fn args)
#else
# define CV_TRY_NEON 0
# define CV_CPU_FORCE_NEON 0
# define CV_CPU_HAS_SUPPORT_NEON 0
# define CV_CPU_CALL_NEON(fn, args)
# define CV_CPU_CALL_NEON_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_NEON(fn, args, mode, ...) CV_CPU_CALL_NEON(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_VSX
# define CV_TRY_VSX 1
# define CV_CPU_FORCE_VSX 1
# define CV_CPU_HAS_SUPPORT_VSX 1
# define CV_CPU_CALL_VSX(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_VSX_(fn, args) return (opt_VSX::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_VSX
# define CV_TRY_VSX 1
# define CV_CPU_FORCE_VSX 0
# define CV_CPU_HAS_SUPPORT_VSX (cv::checkHardwareSupport(CV_CPU_VSX))
# define CV_CPU_CALL_VSX(fn, args) if (CV_CPU_HAS_SUPPORT_VSX) return (opt_VSX::fn args)
# define CV_CPU_CALL_VSX_(fn, args) if (CV_CPU_HAS_SUPPORT_VSX) return (opt_VSX::fn args)
#else
# define CV_TRY_VSX 0
# define CV_CPU_FORCE_VSX 0
# define CV_CPU_HAS_SUPPORT_VSX 0
# define CV_CPU_CALL_VSX(fn, args)
# define CV_CPU_CALL_VSX_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_VSX(fn, args, mode, ...) CV_CPU_CALL_VSX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_VSX3
# define CV_TRY_VSX3 1
# define CV_CPU_FORCE_VSX3 1
# define CV_CPU_HAS_SUPPORT_VSX3 1
# define CV_CPU_CALL_VSX3(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_VSX3_(fn, args) return (opt_VSX3::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_VSX3
# define CV_TRY_VSX3 1
# define CV_CPU_FORCE_VSX3 0
# define CV_CPU_HAS_SUPPORT_VSX3 (cv::checkHardwareSupport(CV_CPU_VSX3))
# define CV_CPU_CALL_VSX3(fn, args) if (CV_CPU_HAS_SUPPORT_VSX3) return (opt_VSX3::fn args)
# define CV_CPU_CALL_VSX3_(fn, args) if (CV_CPU_HAS_SUPPORT_VSX3) return (opt_VSX3::fn args)
#else
# define CV_TRY_VSX3 0
# define CV_CPU_FORCE_VSX3 0
# define CV_CPU_HAS_SUPPORT_VSX3 0
# define CV_CPU_CALL_VSX3(fn, args)
# define CV_CPU_CALL_VSX3_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_VSX3(fn, args, mode, ...) CV_CPU_CALL_VSX3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#define CV_CPU_CALL_BASELINE(fn, args) return (cpu_baseline::fn args)
#define __CV_CPU_DISPATCH_CHAIN_BASELINE(fn, args, mode, ...) CV_CPU_CALL_BASELINE(fn, args) /* last in sequence */
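// A minimal sketch of how this dispatch chain is typically consumed (the
// function name `myKernel` is hypothetical, not part of this header): each
// CV_CPU_CALL_<ISA> returns early through the opt_<ISA> namespace when that
// ISA is a dispatched target supported at runtime, and CV_CPU_CALL_BASELINE
// is the unconditional fallback:
//
//   void myKernel(const float* src, float* dst, int n)
//   {
//       CV_CPU_CALL_AVX2(myKernel, (src, dst, n));     // return opt_AVX2::myKernel(...) if AVX2 is usable
//       CV_CPU_CALL_BASELINE(myKernel, (src, dst, n)); // always returns
//   }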

View File

@@ -0,0 +1,857 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_CORE_CVDEF_H
#define OPENCV_CORE_CVDEF_H
//! @addtogroup core_utils
//! @{
#if !defined CV_DOXYGEN && !defined CV_IGNORE_DEBUG_BUILD_GUARD
#if (defined(_MSC_VER) && (defined(DEBUG) || defined(_DEBUG))) || \
(defined(_GLIBCXX_DEBUG) || defined(_GLIBCXX_DEBUG_PEDANTIC))
// Guard against mixing binary-incompatible binaries / runtimes
// https://github.com/opencv/opencv/pull/9161
#define CV__DEBUG_NS_BEGIN namespace debug_build_guard {
#define CV__DEBUG_NS_END }
namespace cv { namespace debug_build_guard { } using namespace debug_build_guard; }
#endif
#endif
#ifndef CV__DEBUG_NS_BEGIN
#define CV__DEBUG_NS_BEGIN
#define CV__DEBUG_NS_END
#endif
#ifdef __OPENCV_BUILD
#include "cvconfig.h"
#endif
#ifndef __CV_EXPAND
#define __CV_EXPAND(x) x
#endif
#ifndef __CV_CAT
#define __CV_CAT__(x, y) x ## y
#define __CV_CAT_(x, y) __CV_CAT__(x, y)
#define __CV_CAT(x, y) __CV_CAT_(x, y)
#endif
#define __CV_VA_NUM_ARGS_HELPER(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, N, ...) N
#define __CV_VA_NUM_ARGS(...) __CV_EXPAND(__CV_VA_NUM_ARGS_HELPER(__VA_ARGS__, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0))
#if defined __GNUC__
#define CV_Func __func__
#elif defined _MSC_VER
#define CV_Func __FUNCTION__
#else
#define CV_Func ""
#endif
//! @cond IGNORED
//////////////// static assert /////////////////
#define CVAUX_CONCAT_EXP(a, b) a##b
#define CVAUX_CONCAT(a, b) CVAUX_CONCAT_EXP(a,b)
#if defined(__clang__)
# ifndef __has_extension
# define __has_extension __has_feature /* compatibility, for older versions of clang */
# endif
# if __has_extension(cxx_static_assert)
# define CV_StaticAssert(condition, reason) static_assert((condition), reason " " #condition)
# elif __has_extension(c_static_assert)
# define CV_StaticAssert(condition, reason) _Static_assert((condition), reason " " #condition)
# endif
#elif defined(__GNUC__)
# if (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L)
# define CV_StaticAssert(condition, reason) static_assert((condition), reason " " #condition)
# endif
#elif defined(_MSC_VER)
# if _MSC_VER >= 1600 /* MSVC 10 */
# define CV_StaticAssert(condition, reason) static_assert((condition), reason " " #condition)
# endif
#endif
#ifndef CV_StaticAssert
# if !defined(__clang__) && defined(__GNUC__) && (__GNUC__*100 + __GNUC_MINOR__ > 302)
# define CV_StaticAssert(condition, reason) ({ extern int __attribute__((error("CV_StaticAssert: " reason " " #condition))) CV_StaticAssert(); ((condition) ? 0 : CV_StaticAssert()); })
# else
template <bool x> struct CV_StaticAssert_failed;
template <> struct CV_StaticAssert_failed<true> { enum { val = 1 }; };
template<int x> struct CV_StaticAssert_test {};
# define CV_StaticAssert(condition, reason)\
typedef cv::CV_StaticAssert_test< sizeof(cv::CV_StaticAssert_failed< static_cast<bool>(condition) >) > CVAUX_CONCAT(CV_StaticAssert_failed_at_, __LINE__)
# endif
#endif
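// Whichever definition is selected above, usage is the same; a guard like
// the following fails compilation with the given reason when the condition
// is false:
//
//   CV_StaticAssert(sizeof(int) == 4, "OpenCV assumes a 32-bit int here");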
// Suppress warning "-Wdeprecated-declarations" / C4996
#if defined(_MSC_VER)
#define CV_DO_PRAGMA(x) __pragma(x)
#elif defined(__GNUC__)
#define CV_DO_PRAGMA(x) _Pragma (#x)
#else
#define CV_DO_PRAGMA(x)
#endif
#ifdef _MSC_VER
#define CV_SUPPRESS_DEPRECATED_START \
CV_DO_PRAGMA(warning(push)) \
CV_DO_PRAGMA(warning(disable: 4996))
#define CV_SUPPRESS_DEPRECATED_END CV_DO_PRAGMA(warning(pop))
#elif defined (__clang__) || ((__GNUC__) && (__GNUC__*100 + __GNUC_MINOR__ > 405))
#define CV_SUPPRESS_DEPRECATED_START \
CV_DO_PRAGMA(GCC diagnostic push) \
CV_DO_PRAGMA(GCC diagnostic ignored "-Wdeprecated-declarations")
#define CV_SUPPRESS_DEPRECATED_END CV_DO_PRAGMA(GCC diagnostic pop)
#else
#define CV_SUPPRESS_DEPRECATED_START
#define CV_SUPPRESS_DEPRECATED_END
#endif
#define CV_UNUSED(name) (void)name
//! @endcond
// undef problematic defines sometimes defined by system headers (windows.h in particular)
#undef small
#undef min
#undef max
#undef abs
#undef Complex
#include <limits.h>
#include "opencv2/core/hal/interface.h"
#if defined __ICL
# define CV_ICC __ICL
#elif defined __ICC
# define CV_ICC __ICC
#elif defined __ECL
# define CV_ICC __ECL
#elif defined __ECC
# define CV_ICC __ECC
#elif defined __INTEL_COMPILER
# define CV_ICC __INTEL_COMPILER
#endif
#ifndef CV_INLINE
# if defined __cplusplus
# define CV_INLINE static inline
# elif defined _MSC_VER
# define CV_INLINE __inline
# else
# define CV_INLINE static
# endif
#endif
#ifndef CV_ALWAYS_INLINE
#if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
#define CV_ALWAYS_INLINE inline __attribute__((always_inline))
#elif defined(_MSC_VER)
#define CV_ALWAYS_INLINE __forceinline
#else
#define CV_ALWAYS_INLINE inline
#endif
#endif
#if defined CV_DISABLE_OPTIMIZATION || (defined CV_ICC && !defined CV_ENABLE_UNROLLED)
# define CV_ENABLE_UNROLLED 0
#else
# define CV_ENABLE_UNROLLED 1
#endif
#ifdef __GNUC__
# define CV_DECL_ALIGNED(x) __attribute__ ((aligned (x)))
#elif defined _MSC_VER
# define CV_DECL_ALIGNED(x) __declspec(align(x))
#else
# define CV_DECL_ALIGNED(x)
#endif
/* CPU features and intrinsics support */
#define CV_CPU_NONE 0
#define CV_CPU_MMX 1
#define CV_CPU_SSE 2
#define CV_CPU_SSE2 3
#define CV_CPU_SSE3 4
#define CV_CPU_SSSE3 5
#define CV_CPU_SSE4_1 6
#define CV_CPU_SSE4_2 7
#define CV_CPU_POPCNT 8
#define CV_CPU_FP16 9
#define CV_CPU_AVX 10
#define CV_CPU_AVX2 11
#define CV_CPU_FMA3 12
#define CV_CPU_AVX_512F 13
#define CV_CPU_AVX_512BW 14
#define CV_CPU_AVX_512CD 15
#define CV_CPU_AVX_512DQ 16
#define CV_CPU_AVX_512ER 17
#define CV_CPU_AVX_512IFMA512 18 // deprecated
#define CV_CPU_AVX_512IFMA 18
#define CV_CPU_AVX_512PF 19
#define CV_CPU_AVX_512VBMI 20
#define CV_CPU_AVX_512VL 21
#define CV_CPU_NEON 100
#define CV_CPU_VSX 200
#define CV_CPU_VSX3 201
// CPU features groups
#define CV_CPU_AVX512_SKX 256
// when adding to this list remember to update the following enum
#define CV_HARDWARE_MAX_FEATURE 512
/** @brief Available CPU features.
*/
enum CpuFeatures {
CPU_MMX = 1,
CPU_SSE = 2,
CPU_SSE2 = 3,
CPU_SSE3 = 4,
CPU_SSSE3 = 5,
CPU_SSE4_1 = 6,
CPU_SSE4_2 = 7,
CPU_POPCNT = 8,
CPU_FP16 = 9,
CPU_AVX = 10,
CPU_AVX2 = 11,
CPU_FMA3 = 12,
CPU_AVX_512F = 13,
CPU_AVX_512BW = 14,
CPU_AVX_512CD = 15,
CPU_AVX_512DQ = 16,
CPU_AVX_512ER = 17,
CPU_AVX_512IFMA512 = 18, // deprecated
CPU_AVX_512IFMA = 18,
CPU_AVX_512PF = 19,
CPU_AVX_512VBMI = 20,
CPU_AVX_512VL = 21,
CPU_NEON = 100,
CPU_VSX = 200,
CPU_VSX3 = 201,
CPU_AVX512_SKX = 256, //!< Skylake-X with AVX-512F/CD/BW/DQ/VL
CPU_MAX_FEATURE = 512 // see CV_HARDWARE_MAX_FEATURE
};
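// A usage sketch: these enum values feed runtime capability checks
// (cv::checkHardwareSupport is declared elsewhere in the core module):
//
//   if (cv::checkHardwareSupport(CPU_AVX2))
//   {
//       // safe to take an AVX2-optimized code path
//   }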
#include "cv_cpu_dispatch.h"
/* fundamental constants */
#define CV_PI 3.1415926535897932384626433832795
#define CV_2PI 6.283185307179586476925286766559
#define CV_LOG2 0.69314718055994530941723212145818
#if defined __ARM_FP16_FORMAT_IEEE \
&& !defined __CUDACC__
# define CV_FP16_TYPE 1
#else
# define CV_FP16_TYPE 0
#endif
typedef union Cv16suf
{
short i;
ushort u;
#if CV_FP16_TYPE
__fp16 h;
#endif
}
Cv16suf;
typedef union Cv32suf
{
int i;
unsigned u;
float f;
}
Cv32suf;
typedef union Cv64suf
{
int64 i;
uint64 u;
double f;
}
Cv64suf;
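// These unions give well-defined access to the bit pattern of a
// floating-point value; a small sketch:
//
//   Cv32suf u;
//   u.f = -1.5f;
//   bool isNegative = (u.u >> 31) != 0; // inspect the IEEE-754 sign bit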
#define OPENCV_ABI_COMPATIBILITY 400
#ifdef __OPENCV_BUILD
# define DISABLE_OPENCV_3_COMPATIBILITY
# define OPENCV_DISABLE_DEPRECATED_COMPATIBILITY
#endif
#ifdef CVAPI_EXPORTS
# if (defined _WIN32 || defined WINCE || defined __CYGWIN__)
# define CV_EXPORTS __declspec(dllexport)
# elif defined __GNUC__ && __GNUC__ >= 4
# define CV_EXPORTS __attribute__ ((visibility ("default")))
# endif
#endif
#ifndef CV_EXPORTS
# define CV_EXPORTS
#endif
#ifdef _MSC_VER
# define CV_EXPORTS_TEMPLATE
#else
# define CV_EXPORTS_TEMPLATE CV_EXPORTS
#endif
#ifndef CV_DEPRECATED
# if defined(__GNUC__)
# define CV_DEPRECATED __attribute__ ((deprecated))
# elif defined(_MSC_VER)
# define CV_DEPRECATED __declspec(deprecated)
# else
# define CV_DEPRECATED
# endif
#endif
#ifndef CV_DEPRECATED_EXTERNAL
# if defined(__OPENCV_BUILD)
# define CV_DEPRECATED_EXTERNAL /* nothing */
# else
# define CV_DEPRECATED_EXTERNAL CV_DEPRECATED
# endif
#endif
#ifndef CV_EXTERN_C
# ifdef __cplusplus
# define CV_EXTERN_C extern "C"
# else
# define CV_EXTERN_C
# endif
#endif
/* special informative macros for wrapper generators */
#define CV_EXPORTS_W CV_EXPORTS
#define CV_EXPORTS_W_SIMPLE CV_EXPORTS
#define CV_EXPORTS_AS(synonym) CV_EXPORTS
#define CV_EXPORTS_W_MAP CV_EXPORTS
#define CV_IN_OUT
#define CV_OUT
#define CV_PROP
#define CV_PROP_RW
#define CV_WRAP
#define CV_WRAP_AS(synonym)
#define CV_WRAP_MAPPABLE(mappable)
#define CV_WRAP_PHANTOM(phantom_header)
#define CV_WRAP_DEFAULT(val)
/****************************************************************************************\
* Matrix type (Mat) *
\****************************************************************************************/
#define CV_MAT_CN_MASK ((CV_CN_MAX - 1) << CV_CN_SHIFT)
#define CV_MAT_CN(flags) ((((flags) & CV_MAT_CN_MASK) >> CV_CN_SHIFT) + 1)
#define CV_MAT_TYPE_MASK (CV_DEPTH_MAX*CV_CN_MAX - 1)
#define CV_MAT_TYPE(flags) ((flags) & CV_MAT_TYPE_MASK)
#define CV_MAT_CONT_FLAG_SHIFT 14
#define CV_MAT_CONT_FLAG (1 << CV_MAT_CONT_FLAG_SHIFT)
#define CV_IS_MAT_CONT(flags) ((flags) & CV_MAT_CONT_FLAG)
#define CV_IS_CONT_MAT CV_IS_MAT_CONT
#define CV_SUBMAT_FLAG_SHIFT 15
#define CV_SUBMAT_FLAG (1 << CV_SUBMAT_FLAG_SHIFT)
#define CV_IS_SUBMAT(flags) ((flags) & CV_SUBMAT_FLAG)
/** Size of each channel item,
0x28442211 = 0010 1000 0100 0100 0010 0010 0001 0001 ~ array of sizeof(arr_type_elem) */
#define CV_ELEM_SIZE1(type) ((0x28442211 >> CV_MAT_DEPTH(type)*4) & 15)
#define CV_ELEM_SIZE(type) (CV_MAT_CN(type)*CV_ELEM_SIZE1(type))
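// Worked example: CV_32F is depth 5, so CV_ELEM_SIZE1(CV_32F) reads the 5th
// nibble of 0x28442211: (0x28442211 >> (5*4)) & 15 == 4 bytes. For a
// 3-channel type, CV_ELEM_SIZE(CV_32FC3) == 3 * 4 == 12 bytes per element.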
#ifndef MIN
# define MIN(a,b) ((a) > (b) ? (b) : (a))
#endif
#ifndef MAX
# define MAX(a,b) ((a) < (b) ? (b) : (a))
#endif
///////////////////////////////////////// Enum operators ///////////////////////////////////////
/**
Provides compatibility operators for both classical and C++11 enum classes,
as well as exposing the C++11 enum class members for backwards compatibility
@code
// Provides operators required for flag enums
CV_ENUM_FLAGS(AccessFlag)
// Exposes the listed members of the enum class AccessFlag to the current namespace
CV_ENUM_CLASS_EXPOSE(AccessFlag, ACCESS_READ [, ACCESS_WRITE [, ...] ]);
@endcode
*/
#define __CV_ENUM_CLASS_EXPOSE_1(EnumType, MEMBER_CONST) \
static const EnumType MEMBER_CONST = EnumType::MEMBER_CONST; \
#define __CV_ENUM_CLASS_EXPOSE_2(EnumType, MEMBER_CONST, ...) \
__CV_ENUM_CLASS_EXPOSE_1(EnumType, MEMBER_CONST); \
__CV_EXPAND(__CV_ENUM_CLASS_EXPOSE_1(EnumType, __VA_ARGS__)); \
#define __CV_ENUM_CLASS_EXPOSE_3(EnumType, MEMBER_CONST, ...) \
__CV_ENUM_CLASS_EXPOSE_1(EnumType, MEMBER_CONST); \
__CV_EXPAND(__CV_ENUM_CLASS_EXPOSE_2(EnumType, __VA_ARGS__)); \
#define __CV_ENUM_CLASS_EXPOSE_4(EnumType, MEMBER_CONST, ...) \
__CV_ENUM_CLASS_EXPOSE_1(EnumType, MEMBER_CONST); \
__CV_EXPAND(__CV_ENUM_CLASS_EXPOSE_3(EnumType, __VA_ARGS__)); \
#define __CV_ENUM_CLASS_EXPOSE_5(EnumType, MEMBER_CONST, ...) \
__CV_ENUM_CLASS_EXPOSE_1(EnumType, MEMBER_CONST); \
__CV_EXPAND(__CV_ENUM_CLASS_EXPOSE_4(EnumType, __VA_ARGS__)); \
#define __CV_ENUM_CLASS_EXPOSE_6(EnumType, MEMBER_CONST, ...) \
__CV_ENUM_CLASS_EXPOSE_1(EnumType, MEMBER_CONST); \
__CV_EXPAND(__CV_ENUM_CLASS_EXPOSE_5(EnumType, __VA_ARGS__)); \
#define __CV_ENUM_CLASS_EXPOSE_7(EnumType, MEMBER_CONST, ...) \
__CV_ENUM_CLASS_EXPOSE_1(EnumType, MEMBER_CONST); \
__CV_EXPAND(__CV_ENUM_CLASS_EXPOSE_6(EnumType, __VA_ARGS__)); \
#define __CV_ENUM_CLASS_EXPOSE_8(EnumType, MEMBER_CONST, ...) \
__CV_ENUM_CLASS_EXPOSE_1(EnumType, MEMBER_CONST); \
__CV_EXPAND(__CV_ENUM_CLASS_EXPOSE_7(EnumType, __VA_ARGS__)); \
#define __CV_ENUM_CLASS_EXPOSE_9(EnumType, MEMBER_CONST, ...) \
__CV_ENUM_CLASS_EXPOSE_1(EnumType, MEMBER_CONST); \
__CV_EXPAND(__CV_ENUM_CLASS_EXPOSE_8(EnumType, __VA_ARGS__)); \
#define __CV_ENUM_FLAGS_LOGICAL_NOT(EnumType) \
static inline bool operator!(const EnumType& val) \
{ \
typedef std::underlying_type<EnumType>::type UnderlyingType; \
return !static_cast<UnderlyingType>(val); \
} \
#define __CV_ENUM_FLAGS_LOGICAL_NOT_EQ(Arg1Type, Arg2Type) \
static inline bool operator!=(const Arg1Type& a, const Arg2Type& b) \
{ \
return static_cast<int>(a) != static_cast<int>(b); \
} \
#define __CV_ENUM_FLAGS_LOGICAL_EQ(Arg1Type, Arg2Type) \
static inline bool operator==(const Arg1Type& a, const Arg2Type& b) \
{ \
return static_cast<int>(a) == static_cast<int>(b); \
} \
#define __CV_ENUM_FLAGS_BITWISE_NOT(EnumType) \
static inline EnumType operator~(const EnumType& val) \
{ \
typedef std::underlying_type<EnumType>::type UnderlyingType; \
return static_cast<EnumType>(~static_cast<UnderlyingType>(val)); \
} \
#define __CV_ENUM_FLAGS_BITWISE_OR(EnumType, Arg1Type, Arg2Type) \
static inline EnumType operator|(const Arg1Type& a, const Arg2Type& b) \
{ \
typedef std::underlying_type<EnumType>::type UnderlyingType; \
return static_cast<EnumType>(static_cast<UnderlyingType>(a) | static_cast<UnderlyingType>(b)); \
} \
#define __CV_ENUM_FLAGS_BITWISE_AND(EnumType, Arg1Type, Arg2Type) \
static inline EnumType operator&(const Arg1Type& a, const Arg2Type& b) \
{ \
typedef std::underlying_type<EnumType>::type UnderlyingType; \
return static_cast<EnumType>(static_cast<UnderlyingType>(a) & static_cast<UnderlyingType>(b)); \
} \
#define __CV_ENUM_FLAGS_BITWISE_XOR(EnumType, Arg1Type, Arg2Type) \
static inline EnumType operator^(const Arg1Type& a, const Arg2Type& b) \
{ \
typedef std::underlying_type<EnumType>::type UnderlyingType; \
return static_cast<EnumType>(static_cast<UnderlyingType>(a) ^ static_cast<UnderlyingType>(b)); \
} \
#define __CV_ENUM_FLAGS_BITWISE_OR_EQ(EnumType, Arg1Type) \
static inline EnumType& operator|=(EnumType& _this, const Arg1Type& val) \
{ \
_this = static_cast<EnumType>(static_cast<int>(_this) | static_cast<int>(val)); \
return _this; \
} \
#define __CV_ENUM_FLAGS_BITWISE_AND_EQ(EnumType, Arg1Type) \
static inline EnumType& operator&=(EnumType& _this, const Arg1Type& val) \
{ \
_this = static_cast<EnumType>(static_cast<int>(_this) & static_cast<int>(val)); \
return _this; \
} \
#define __CV_ENUM_FLAGS_BITWISE_XOR_EQ(EnumType, Arg1Type) \
static inline EnumType& operator^=(EnumType& _this, const Arg1Type& val) \
{ \
_this = static_cast<EnumType>(static_cast<int>(_this) ^ static_cast<int>(val)); \
return _this; \
} \
#define CV_ENUM_CLASS_EXPOSE(EnumType, ...) \
__CV_EXPAND(__CV_CAT(__CV_ENUM_CLASS_EXPOSE_, __CV_VA_NUM_ARGS(__VA_ARGS__))(EnumType, __VA_ARGS__)); \
#define CV_ENUM_FLAGS(EnumType) \
__CV_ENUM_FLAGS_LOGICAL_NOT (EnumType) \
__CV_ENUM_FLAGS_LOGICAL_EQ (EnumType, int) \
__CV_ENUM_FLAGS_LOGICAL_NOT_EQ (EnumType, int) \
\
__CV_ENUM_FLAGS_BITWISE_NOT (EnumType) \
__CV_ENUM_FLAGS_BITWISE_OR (EnumType, EnumType, EnumType) \
__CV_ENUM_FLAGS_BITWISE_AND (EnumType, EnumType, EnumType) \
__CV_ENUM_FLAGS_BITWISE_XOR (EnumType, EnumType, EnumType) \
\
__CV_ENUM_FLAGS_BITWISE_OR_EQ (EnumType, EnumType) \
__CV_ENUM_FLAGS_BITWISE_AND_EQ (EnumType, EnumType) \
__CV_ENUM_FLAGS_BITWISE_XOR_EQ (EnumType, EnumType) \
/****************************************************************************************\
* static analysis *
\****************************************************************************************/
// In practice, some macros are not processed correctly (noreturn is not detected).
// We need to use a simplified definition for them.
#ifndef CV_STATIC_ANALYSIS
# if defined(__KLOCWORK__) || defined(__clang_analyzer__) || defined(__COVERITY__)
# define CV_STATIC_ANALYSIS 1
# endif
#else
# if defined(CV_STATIC_ANALYSIS) && !(__CV_CAT(1, CV_STATIC_ANALYSIS) == 1) // defined and not empty
# if 0 == CV_STATIC_ANALYSIS
# undef CV_STATIC_ANALYSIS
# endif
# endif
#endif
/****************************************************************************************\
* Thread sanitizer *
\****************************************************************************************/
#ifndef CV_THREAD_SANITIZER
# if defined(__has_feature)
# if __has_feature(thread_sanitizer)
# define CV_THREAD_SANITIZER
# endif
# endif
#endif
/****************************************************************************************\
* exchange-add operation for atomic operations on reference counters *
\****************************************************************************************/
#ifdef CV_XADD
// allow a user-defined macro to take precedence
#elif defined __GNUC__ || defined __clang__
# if defined __clang__ && __clang_major__ >= 3 && !defined __ANDROID__ && !defined __EMSCRIPTEN__ && !defined(__CUDACC__) && !defined __INTEL_COMPILER
# ifdef __ATOMIC_ACQ_REL
# define CV_XADD(addr, delta) __c11_atomic_fetch_add((_Atomic(int)*)(addr), delta, __ATOMIC_ACQ_REL)
# else
# define CV_XADD(addr, delta) __atomic_fetch_add((_Atomic(int)*)(addr), delta, 4)
# endif
# else
# if defined __ATOMIC_ACQ_REL && !defined __clang__
// version for gcc >= 4.7
# define CV_XADD(addr, delta) (int)__atomic_fetch_add((unsigned*)(addr), (unsigned)(delta), __ATOMIC_ACQ_REL)
# else
# define CV_XADD(addr, delta) (int)__sync_fetch_and_add((unsigned*)(addr), (unsigned)(delta))
# endif
# endif
#elif defined _MSC_VER && !defined RC_INVOKED
# include <intrin.h>
# define CV_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile*)addr, delta)
#else
CV_INLINE CV_XADD(int* addr, int delta) { int tmp = *addr; *addr += delta; return tmp; }
#endif
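// CV_XADD atomically adds `delta` and returns the *previous* value, which is
// exactly the idiom reference counting needs (a sketch with a hypothetical
// counter):
//
//   int refcount = 1;
//   CV_XADD(&refcount, 1);            // acquire: 1 -> 2
//   if (CV_XADD(&refcount, -1) == 1)  // release: old value 1 means last owner
//   {
//       // free the shared resource here
//   }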
/****************************************************************************************\
* CV_NORETURN attribute *
\****************************************************************************************/
#ifndef CV_NORETURN
# if defined(__GNUC__)
# define CV_NORETURN __attribute__((__noreturn__))
# elif defined(_MSC_VER) && (_MSC_VER >= 1300)
# define CV_NORETURN __declspec(noreturn)
# else
# define CV_NORETURN /* nothing by default */
# endif
#endif
/****************************************************************************************\
* CV_NODISCARD attribute *
* encourages the compiler to issue a warning if the return value is discarded (C++17) *
\****************************************************************************************/
#ifndef CV_NODISCARD
# if defined(__GNUC__)
# define CV_NODISCARD __attribute__((__warn_unused_result__)) // at least available with GCC 3.4
# elif defined(__clang__) && defined(__has_attribute)
# if __has_attribute(__warn_unused_result__)
# define CV_NODISCARD __attribute__((__warn_unused_result__))
# endif
# endif
#endif
#ifndef CV_NODISCARD
# define CV_NODISCARD /* nothing by default */
#endif
/****************************************************************************************\
* C++ 11 *
\****************************************************************************************/
#ifndef CV_CXX11
# if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1800)
# define CV_CXX11 1
# endif
#else
# if CV_CXX11 == 0
# undef CV_CXX11
# endif
#endif
#ifndef CV_CXX11
# error "OpenCV 4.x+ requires enabled C++11 support"
#endif
#define CV_CXX_MOVE_SEMANTICS 1
#define CV_CXX_STD_ARRAY 1
#include <array>
#ifndef CV_OVERRIDE
# define CV_OVERRIDE override
#endif
#ifndef CV_FINAL
# define CV_FINAL final
#endif
#ifndef CV_NOEXCEPT
# if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900/*MSVS 2015*/)
# define CV_NOEXCEPT noexcept
# endif
#endif
#ifndef CV_NOEXCEPT
# define CV_NOEXCEPT
#endif
#ifndef CV_CONSTEXPR
# if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900/*MSVS 2015*/)
# define CV_CONSTEXPR constexpr
# endif
#endif
#ifndef CV_CONSTEXPR
# define CV_CONSTEXPR
#endif
// Integer type portability
#ifdef OPENCV_STDINT_HEADER
#include OPENCV_STDINT_HEADER
#elif defined(__cplusplus)
#if defined(_MSC_VER) && _MSC_VER < 1600 /* MSVS 2010 */
namespace cv {
typedef signed char int8_t;
typedef unsigned char uint8_t;
typedef signed short int16_t;
typedef unsigned short uint16_t;
typedef signed int int32_t;
typedef unsigned int uint32_t;
typedef signed __int64 int64_t;
typedef unsigned __int64 uint64_t;
}
#elif defined(_MSC_VER) || __cplusplus >= 201103L
#include <cstdint>
namespace cv {
using std::int8_t;
using std::uint8_t;
using std::int16_t;
using std::uint16_t;
using std::int32_t;
using std::uint32_t;
using std::int64_t;
using std::uint64_t;
}
#else
#include <stdint.h>
namespace cv {
typedef ::int8_t int8_t;
typedef ::uint8_t uint8_t;
typedef ::int16_t int16_t;
typedef ::uint16_t uint16_t;
typedef ::int32_t int32_t;
typedef ::uint32_t uint32_t;
typedef ::int64_t int64_t;
typedef ::uint64_t uint64_t;
}
#endif
#else // pure C
#include <stdint.h>
#endif
#ifdef __cplusplus
namespace cv
{
class float16_t
{
public:
#if CV_FP16_TYPE
float16_t() : h(0) {}
explicit float16_t(float x) { h = (__fp16)x; }
operator float() const { return (float)h; }
static float16_t fromBits(ushort w)
{
Cv16suf u;
u.u = w;
float16_t result;
result.h = u.h;
return result;
}
static float16_t zero()
{
float16_t result;
result.h = (__fp16)0;
return result;
}
ushort bits() const
{
Cv16suf u;
u.h = h;
return u.u;
}
protected:
__fp16 h;
#else
float16_t() : w(0) {}
explicit float16_t(float x)
{
#if CV_AVX2
__m128 v = _mm_load_ss(&x);
w = (ushort)_mm_cvtsi128_si32(_mm_cvtps_ph(v, 0));
#else
Cv32suf in;
in.f = x;
unsigned sign = in.u & 0x80000000;
in.u ^= sign;
if( in.u >= 0x47800000 )
w = (ushort)(in.u > 0x7f800000 ? 0x7e00 : 0x7c00);
else
{
if (in.u < 0x38800000)
{
in.f += 0.5f;
w = (ushort)(in.u - 0x3f000000);
}
else
{
unsigned t = in.u + 0xc8000fff;
w = (ushort)((t + ((in.u >> 13) & 1)) >> 13);
}
}
w = (ushort)(w | (sign >> 16));
#endif
}
operator float() const
{
#if CV_AVX2
float f;
_mm_store_ss(&f, _mm_cvtph_ps(_mm_cvtsi32_si128(w)));
return f;
#else
Cv32suf out;
unsigned t = ((w & 0x7fff) << 13) + 0x38000000;
unsigned sign = (w & 0x8000) << 16;
unsigned e = w & 0x7c00;
out.u = t + (1 << 23);
out.u = (e >= 0x7c00 ? t + 0x38000000 :
e == 0 ? (static_cast<void>(out.f -= 6.103515625e-05f), out.u) : t) | sign;
return out.f;
#endif
}
static float16_t fromBits(ushort b)
{
float16_t result;
result.w = b;
return result;
}
static float16_t zero()
{
float16_t result;
result.w = (ushort)0;
return result;
}
ushort bits() const { return w; }
protected:
ushort w;
#endif
};
}
#endif
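// A brief usage sketch for cv::float16_t; conversions round-trip through
// float whether or not a native __fp16 type is available:
//
//   cv::float16_t h(0.25f);   // float -> half
//   float f = (float)h;       // half -> float, f == 0.25f
//   ushort raw = h.bits();    // raw IEEE-754 binary16 bits (0x3400 here)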
//! @}
#ifndef __cplusplus
#include "opencv2/core/fast_math.hpp" // define cvRound(double)
#endif
#endif // OPENCV_CORE_CVDEF_H

View File

@@ -0,0 +1,190 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_CORE_CVSTD_HPP
#define OPENCV_CORE_CVSTD_HPP
#ifndef __cplusplus
# error cvstd.hpp header must be compiled as C++
#endif
#include "opencv2/core/cvdef.h"
#include <cstddef>
#include <cstring>
#include <cctype>
#include <string>
// import useful primitives from stl
# include <algorithm>
# include <utility>
# include <cstdlib> //for abs(int)
# include <cmath>
namespace cv
{
static inline uchar abs(uchar a) { return a; }
static inline ushort abs(ushort a) { return a; }
static inline unsigned abs(unsigned a) { return a; }
static inline uint64 abs(uint64 a) { return a; }
using std::min;
using std::max;
using std::abs;
using std::swap;
using std::sqrt;
using std::exp;
using std::pow;
using std::log;
}
#include "cvstd_wrapper.hpp"
namespace cv {
//! @addtogroup core_utils
//! @{
//////////////////////////// memory management functions ////////////////////////////
/** @brief Allocates an aligned memory buffer.
The function allocates the buffer of the specified size and returns it. When the buffer size is 16
bytes or more, the returned buffer is aligned to 16 bytes.
@param bufSize Allocated buffer size.
*/
CV_EXPORTS void* fastMalloc(size_t bufSize);
/** @brief Deallocates a memory buffer.
The function deallocates the buffer allocated with fastMalloc . If NULL pointer is passed, the
function does nothing. C version of the function clears the pointer *pptr* to avoid problems with
double memory deallocation.
@param ptr Pointer to the allocated buffer.
*/
CV_EXPORTS void fastFree(void* ptr);
/*!
The STL-compliant memory Allocator based on cv::fastMalloc() and cv::fastFree()
*/
template<typename _Tp> class Allocator
{
public:
typedef _Tp value_type;
typedef value_type* pointer;
typedef const value_type* const_pointer;
typedef value_type& reference;
typedef const value_type& const_reference;
typedef size_t size_type;
typedef ptrdiff_t difference_type;
template<typename U> class rebind { typedef Allocator<U> other; };
explicit Allocator() {}
~Allocator() {}
explicit Allocator(Allocator const&) {}
template<typename U>
explicit Allocator(Allocator<U> const&) {}
// address
pointer address(reference r) { return &r; }
const_pointer address(const_reference r) { return &r; }
pointer allocate(size_type count, const void* =0) { return reinterpret_cast<pointer>(fastMalloc(count * sizeof (_Tp))); }
void deallocate(pointer p, size_type) { fastFree(p); }
void construct(pointer p, const _Tp& v) { new(static_cast<void*>(p)) _Tp(v); }
void destroy(pointer p) { p->~_Tp(); }
size_type max_size() const { return cv::max(static_cast<_Tp>(-1)/sizeof(_Tp), 1); }
};
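// A usage sketch: plugging the allocator into a standard container so its
// storage comes from fastMalloc/fastFree (and is therefore 16-byte aligned
// for buffers of 16 bytes or more):
//
//   std::vector<float, cv::Allocator<float> > v(1024);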
//! @} core_utils
//! @endcond
//! @addtogroup core_basic
//! @{
//////////////////////////////// string class ////////////////////////////////
class CV_EXPORTS FileNode; //for string constructor from FileNode
typedef std::string String;
#ifndef OPENCV_DISABLE_STRING_LOWER_UPPER_CONVERSIONS
//! @cond IGNORED
namespace details {
// std::tolower is int->int
static inline char char_tolower(char ch)
{
return (char)std::tolower((int)ch);
}
// std::toupper is int->int
static inline char char_toupper(char ch)
{
return (char)std::toupper((int)ch);
}
} // namespace details
//! @endcond
static inline std::string toLowerCase(const std::string& str)
{
std::string result(str);
std::transform(result.begin(), result.end(), result.begin(), details::char_tolower);
return result;
}
static inline std::string toUpperCase(const std::string& str)
{
std::string result(str);
std::transform(result.begin(), result.end(), result.begin(), details::char_toupper);
return result;
}
#endif // OPENCV_DISABLE_STRING_LOWER_UPPER_CONVERSIONS
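// For example, cv::toLowerCase("OpenCV") yields "opencv" and
// cv::toUpperCase("OpenCV") yields "OPENCV"; the transforms are per-char
// (std::tolower/std::toupper), so only ASCII case is handled.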
//! @} core_basic
} // cv
#endif //OPENCV_CORE_CVSTD_HPP

View File

@@ -73,95 +73,6 @@ public:
typedef Vec<channel_type, channels> vec_type;
};
inline
String::String(const std::string& str)
: cstr_(0), len_(0)
{
if (!str.empty())
{
size_t len = str.size();
if (len) memcpy(allocate(len), str.c_str(), len);
}
}
inline
String::String(const std::string& str, size_t pos, size_t len)
: cstr_(0), len_(0)
{
size_t strlen = str.size();
pos = min(pos, strlen);
len = min(strlen - pos, len);
if (!len) return;
memcpy(allocate(len), str.c_str() + pos, len);
}
inline
String& String::operator = (const std::string& str)
{
deallocate();
if (!str.empty())
{
size_t len = str.size();
if (len) memcpy(allocate(len), str.c_str(), len);
}
return *this;
}
inline
String& String::operator += (const std::string& str)
{
*this = *this + str;
return *this;
}
inline
String::operator std::string() const
{
return std::string(cstr_, len_);
}
inline
String operator + (const String& lhs, const std::string& rhs)
{
String s;
size_t rhslen = rhs.size();
s.allocate(lhs.len_ + rhslen);
if (lhs.len_) memcpy(s.cstr_, lhs.cstr_, lhs.len_);
if (rhslen) memcpy(s.cstr_ + lhs.len_, rhs.c_str(), rhslen);
return s;
}
inline
String operator + (const std::string& lhs, const String& rhs)
{
String s;
size_t lhslen = lhs.size();
s.allocate(lhslen + rhs.len_);
if (lhslen) memcpy(s.cstr_, lhs.c_str(), lhslen);
if (rhs.len_) memcpy(s.cstr_ + lhslen, rhs.cstr_, rhs.len_);
return s;
}
inline
FileNode::operator std::string() const
{
String value;
read(*this, value, value);
return value;
}
template<> inline
void operator >> (const FileNode& n, std::string& value)
{
read(n, value, std::string());
}
template<> inline
FileStorage& operator << (FileStorage& fs, const std::string& value)
{
return fs << cv::String(value);
}
static inline
std::ostream& operator << (std::ostream& os, const String& str)
{
@@ -265,16 +176,21 @@ std::ostream& operator << (std::ostream& out, const Rect_<_Tp>& rect)
static inline std::ostream& operator << (std::ostream& out, const MatSize& msize)
{
int i, dims = msize.dims();
for( i = 0; i < dims; i++ )
{
out << msize[i];
if( i < dims-1 )
out << " x ";
}
return out;
}
static inline std::ostream &operator<< (std::ostream &s, cv::Range &r)
{
return s << "[" << r.start << " : " << r.end << ")";
}
} // cv
#ifdef _MSC_VER

View File

@@ -0,0 +1,149 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_CORE_CVSTD_WRAPPER_HPP
#define OPENCV_CORE_CVSTD_WRAPPER_HPP
#include "opencv2/core/cvdef.h"
#include <string>
#include <memory> // std::shared_ptr
#include <type_traits> // std::enable_if
namespace cv {
using std::nullptr_t;
//! @addtogroup core_basic
//! @{
#ifdef CV_DOXYGEN
template <typename _Tp> using Ptr = std::shared_ptr<_Tp>; // In an ideal world it would look like this, but we need some compatibility workarounds below
template<typename _Tp, typename ... A1> static inline
Ptr<_Tp> makePtr(const A1&... a1) { return std::make_shared<_Tp>(a1...); }
#else // cv::Ptr with compatibility workarounds
// It should be defined for C-API types only.
// C++ types should use regular "delete" operator.
template<typename Y> struct DefaultDeleter;
#if 0
{
void operator()(Y* p) const;
};
#endif
namespace sfinae {
template<typename C, typename Ret, typename... Args>
struct has_parenthesis_operator
{
private:
template<typename T>
static CV_CONSTEXPR std::true_type check(typename std::is_same<typename std::decay<decltype(std::declval<T>().operator()(std::declval<Args>()...))>::type, Ret>::type*);
template<typename> static CV_CONSTEXPR std::false_type check(...);
typedef decltype(check<C>(0)) type;
public:
static CV_CONSTEXPR bool value = type::value;
};
} // namespace sfinae
template <typename T, typename = void>
struct has_custom_delete
: public std::false_type {};
// Force has_custom_delete to std::false_type when NVCC is compiling CUDA source files
#ifndef __CUDACC__
template <typename T>
struct has_custom_delete<T, typename std::enable_if< sfinae::has_parenthesis_operator<DefaultDeleter<T>, void, T*>::value >::type >
: public std::true_type {};
#endif
template<typename T>
struct Ptr : public std::shared_ptr<T>
{
#if 0
using std::shared_ptr<T>::shared_ptr; // GCC 5.x can't handle this
#else
inline Ptr() CV_NOEXCEPT : std::shared_ptr<T>() {}
inline Ptr(nullptr_t) CV_NOEXCEPT : std::shared_ptr<T>(nullptr) {}
template<typename Y, typename D> inline Ptr(Y* p, D d) : std::shared_ptr<T>(p, d) {}
template<typename D> inline Ptr(nullptr_t, D d) : std::shared_ptr<T>(nullptr, d) {}
template<typename Y> inline Ptr(const Ptr<Y>& r, T* ptr) CV_NOEXCEPT : std::shared_ptr<T>(r, ptr) {}
inline Ptr(const Ptr<T>& o) CV_NOEXCEPT : std::shared_ptr<T>(o) {}
inline Ptr(Ptr<T>&& o) CV_NOEXCEPT : std::shared_ptr<T>(std::move(o)) {}
template<typename Y> inline Ptr(const Ptr<Y>& o) CV_NOEXCEPT : std::shared_ptr<T>(o) {}
template<typename Y> inline Ptr(Ptr<Y>&& o) CV_NOEXCEPT : std::shared_ptr<T>(std::move(o)) {}
#endif
inline Ptr(const std::shared_ptr<T>& o) CV_NOEXCEPT : std::shared_ptr<T>(o) {}
inline Ptr(std::shared_ptr<T>&& o) CV_NOEXCEPT : std::shared_ptr<T>(std::move(o)) {}
// Overload with custom DefaultDeleter: Ptr<IplImage>(...)
template<typename Y>
inline Ptr(const std::true_type&, Y* ptr) : std::shared_ptr<T>(ptr, DefaultDeleter<Y>()) {}
// Overload without custom deleter: Ptr<std::string>(...);
template<typename Y>
inline Ptr(const std::false_type&, Y* ptr) : std::shared_ptr<T>(ptr) {}
template<typename Y = T>
inline Ptr(Y* ptr) : Ptr(has_custom_delete<Y>(), ptr) {}
// Overload with custom DefaultDeleter: Ptr<IplImage>(...)
template<typename Y>
inline void reset(const std::true_type&, Y* ptr) { std::shared_ptr<T>::reset(ptr, DefaultDeleter<Y>()); }
// Overload without custom deleter: Ptr<std::string>(...);
template<typename Y>
inline void reset(const std::false_type&, Y* ptr) { std::shared_ptr<T>::reset(ptr); }
template<typename Y>
inline void reset(Y* ptr) { Ptr<T>::reset(has_custom_delete<Y>(), ptr); }
template<class Y, class Deleter>
void reset(Y* ptr, Deleter d) { std::shared_ptr<T>::reset(ptr, d); }
void reset() CV_NOEXCEPT { std::shared_ptr<T>::reset(); }
Ptr& operator=(const Ptr& o) { std::shared_ptr<T>::operator =(o); return *this; }
template<typename Y> inline Ptr& operator=(const Ptr<Y>& o) { std::shared_ptr<T>::operator =(o); return *this; }
T* operator->() const CV_NOEXCEPT { return std::shared_ptr<T>::get();}
typename std::add_lvalue_reference<T>::type operator*() const CV_NOEXCEPT { return *std::shared_ptr<T>::get(); }
// OpenCV 3.x methods (not a part of standard C++ library)
inline void release() { std::shared_ptr<T>::reset(); }
inline operator T* () const { return std::shared_ptr<T>::get(); }
inline bool empty() const { return std::shared_ptr<T>::get() == nullptr; }
template<typename Y> inline
Ptr<Y> staticCast() const CV_NOEXCEPT { return std::static_pointer_cast<Y>(*this); }
template<typename Y> inline
Ptr<Y> constCast() const CV_NOEXCEPT { return std::const_pointer_cast<Y>(*this); }
template<typename Y> inline
Ptr<Y> dynamicCast() const CV_NOEXCEPT { return std::dynamic_pointer_cast<Y>(*this); }
};
template<typename _Tp, typename ... A1> static inline
Ptr<_Tp> makePtr(const A1&... a1)
{
static_assert( !has_custom_delete<_Tp>::value, "Can't use this makePtr with custom DefaultDeleter");
return (Ptr<_Tp>)std::make_shared<_Tp>(a1...);
}
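// A usage sketch: Ptr behaves as std::shared_ptr with the OpenCV 3.x surface
// kept for compatibility (the payload type here is arbitrary):
//
//   cv::Ptr<std::vector<int> > p = cv::makePtr<std::vector<int> >(10, 0);
//   p->push_back(1);   // operator-> forwards to the managed vector
//   if (!p.empty())    // OpenCV 3.x-style check, same as p != nullptr
//       p.release();   // drops this reference (shared_ptr::reset)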
#endif // CV_DOXYGEN
//! @} core_basic
} // cv
#endif //OPENCV_CORE_CVSTD_WRAPPER_HPP

View File

@@ -60,7 +60,7 @@ namespace cv
//! @{
template<typename _Tp, int _rows, int _cols, int _options, int _maxRows, int _maxCols> static inline
void eigen2cv( const Eigen::Matrix<_Tp, _rows, _cols, _options, _maxRows, _maxCols>& src, OutputArray dst )
{
if( !(src.Flags & Eigen::RowMajorBit) )
{

View File

@@ -70,16 +70,12 @@
# endif
#endif
#ifdef HAVE_TEGRA_OPTIMIZATION
# include "tegra_round.hpp"
#endif
#if defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__ || defined __ARM_NEON__) && !defined __SOFTFP__ && !defined(__CUDACC__)
// 1. general scheme
#define ARM_ROUND(_value, _asm_string) \
int res; \
float temp; \
CV_UNUSED(temp); \
__asm__(_asm_string : [res] "=r" (res), [temp] "=w" (temp) : [value] "w" (_value)); \
return res
// 2. version for double
@@ -112,9 +108,6 @@ cvRound( double value )
fistp t;
}
return t;
#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \
defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
TEGRA_ROUND_DBL(value);
#elif defined CV_ICC || defined __GNUC__
# if defined ARM_ROUND_DBL
ARM_ROUND_DBL(value);
@@ -200,9 +193,6 @@ CV_INLINE int cvRound(float value)
fistp t;
}
return t;
#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \
defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
TEGRA_ROUND_FLT(value);
#elif defined CV_ICC || defined __GNUC__
# if defined ARM_ROUND_FLT
ARM_ROUND_FLT(value);

View File

@@ -195,6 +195,12 @@ CV_EXPORTS void addWeighted32s( const int* src1, size_t step1, const int* src2,
CV_EXPORTS void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scalars );
CV_EXPORTS void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scalars );
CV_EXPORTS void cvt16f32f( const float16_t* src, float* dst, int len );
CV_EXPORTS void cvt32f16f( const float* src, float16_t* dst, int len );
CV_EXPORTS void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len );
CV_EXPORTS void addRNGBias64f( double* arr, const double* scaleBiasPairs, int len );
struct CV_EXPORTS DFT1D
{
static Ptr<DFT1D> create(int len, int count, int depth, int flags, bool * useBuffer = 0);

View File

@@ -64,6 +64,8 @@ typedef signed char schar;
# define CV_BIG_UINT(n) n##ULL
#endif
#define CV_USRTYPE1 (void)"CV_USRTYPE1 support has been dropped in OpenCV 4.0"
#define CV_CN_MAX 512
#define CV_CN_SHIFT 3
#define CV_DEPTH_MAX (1 << CV_CN_SHIFT)
@@ -75,7 +77,7 @@ typedef signed char schar;
#define CV_32S 4
#define CV_32F 5
#define CV_64F 6
#define CV_16F 7
#define CV_MAT_DEPTH_MASK (CV_DEPTH_MAX - 1)
#define CV_MAT_DEPTH(flags) ((flags) & CV_MAT_DEPTH_MASK)
@@ -124,6 +126,12 @@ typedef signed char schar;
#define CV_64FC3 CV_MAKETYPE(CV_64F,3)
#define CV_64FC4 CV_MAKETYPE(CV_64F,4)
#define CV_64FC(n) CV_MAKETYPE(CV_64F,(n))
#define CV_16FC1 CV_MAKETYPE(CV_16F,1)
#define CV_16FC2 CV_MAKETYPE(CV_16F,2)
#define CV_16FC3 CV_MAKETYPE(CV_16F,3)
#define CV_16FC4 CV_MAKETYPE(CV_16F,4)
#define CV_16FC(n) CV_MAKETYPE(CV_16F,(n))
//! @}
//! @name Comparison operation

View File

@@ -0,0 +1,429 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_HAL_INTRIN_HPP
#define OPENCV_HAL_INTRIN_HPP
#include <cmath>
#include <float.h>
#include <stdlib.h>
#include "opencv2/core/cvdef.h"
#define OPENCV_HAL_ADD(a, b) ((a) + (b))
#define OPENCV_HAL_AND(a, b) ((a) & (b))
#define OPENCV_HAL_NOP(a) (a)
#define OPENCV_HAL_1ST(a, b) (a)
// Unlike the HAL API, which lives in cv::hal,
// these intrinsics are placed directly in the cv namespace
// so they are easier to access from within OpenCV code.
namespace cv {
namespace hal {
enum StoreMode
{
STORE_UNALIGNED = 0,
STORE_ALIGNED = 1,
STORE_ALIGNED_NOCACHE = 2
};
}
template<typename _Tp> struct V_TypeTraits
{
};
#define CV_INTRIN_DEF_TYPE_TRAITS(type, int_type_, uint_type_, abs_type_, w_type_, q_type_, sum_type_, nlanes128_) \
template<> struct V_TypeTraits<type> \
{ \
typedef type value_type; \
typedef int_type_ int_type; \
typedef abs_type_ abs_type; \
typedef uint_type_ uint_type; \
typedef w_type_ w_type; \
typedef q_type_ q_type; \
typedef sum_type_ sum_type; \
enum { nlanes128 = nlanes128_ }; \
\
static inline int_type reinterpret_int(type x) \
{ \
union { type l; int_type i; } v; \
v.l = x; \
return v.i; \
} \
\
static inline type reinterpret_from_int(int_type x) \
{ \
union { type l; int_type i; } v; \
v.i = x; \
return v.l; \
} \
}
CV_INTRIN_DEF_TYPE_TRAITS(uchar, schar, uchar, uchar, ushort, unsigned, unsigned, 16);
CV_INTRIN_DEF_TYPE_TRAITS(schar, schar, uchar, uchar, short, int, int, 16);
CV_INTRIN_DEF_TYPE_TRAITS(ushort, short, ushort, ushort, unsigned, uint64, unsigned, 8);
CV_INTRIN_DEF_TYPE_TRAITS(short, short, ushort, ushort, int, int64, int, 8);
CV_INTRIN_DEF_TYPE_TRAITS(unsigned, int, unsigned, unsigned, uint64, void, unsigned, 4);
CV_INTRIN_DEF_TYPE_TRAITS(int, int, unsigned, unsigned, int64, void, int, 4);
CV_INTRIN_DEF_TYPE_TRAITS(float, int, unsigned, float, double, void, float, 4);
CV_INTRIN_DEF_TYPE_TRAITS(uint64, int64, uint64, uint64, void, void, uint64, 2);
CV_INTRIN_DEF_TYPE_TRAITS(int64, int64, uint64, uint64, void, void, int64, 2);
CV_INTRIN_DEF_TYPE_TRAITS(double, int64, uint64, double, void, void, double, 2);
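// V_TypeTraits mostly does lane-type bookkeeping for the universal
// intrinsics; e.g. reinterpret_int exposes the bit pattern of a lane value:
//
//   int bits = cv::V_TypeTraits<float>::reinterpret_int(1.0f); // 0x3f800000
//   // nlanes128 == 4: a 128-bit register holds four floats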
#ifndef CV_DOXYGEN
#ifdef CV_CPU_DISPATCH_MODE
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE __CV_CAT(hal_, CV_CPU_DISPATCH_MODE)
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace __CV_CAT(hal_, CV_CPU_DISPATCH_MODE) {
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
#else
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_baseline
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_baseline {
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
#endif
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
#endif
}
#ifdef CV_DOXYGEN
# undef CV_AVX2
# undef CV_SSE2
# undef CV_NEON
# undef CV_VSX
# undef CV_FP16
#endif
#if CV_SSE2 || CV_NEON || CV_VSX
#define CV__SIMD_FORWARD 128
#include "opencv2/core/hal/intrin_forward.hpp"
#endif
#if CV_SSE2
#include "opencv2/core/hal/intrin_sse_em.hpp"
#include "opencv2/core/hal/intrin_sse.hpp"
#elif CV_NEON
#include "opencv2/core/hal/intrin_neon.hpp"
#elif CV_VSX
#include "opencv2/core/hal/intrin_vsx.hpp"
#else
#define CV_SIMD128_CPP 1
#include "opencv2/core/hal/intrin_cpp.hpp"
#endif
// AVX2 can be used together with SSE2, so
// we define those two sets of intrinsics at once.
// Most of the intrinsics do not conflict (the proper overloaded variant is
// resolved by the argument types, e.g. v_float32x4 ~ SSE2, v_float32x8 ~ AVX2),
// but some of AVX2 intrinsics get v256_ prefix instead of v_, e.g. v256_load() vs v_load().
// Correspondingly, the wide intrinsics (which are mapped to the "widest"
// available instruction set) will get vx_ prefix
// (and will be mapped to v256_ counterparts) (e.g. vx_load() => v256_load())
#if CV_AVX2
#define CV__SIMD_FORWARD 256
#include "opencv2/core/hal/intrin_forward.hpp"
#include "opencv2/core/hal/intrin_avx.hpp"
#endif
//! @cond IGNORED
namespace cv {
#ifndef CV_DOXYGEN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#endif
#ifndef CV_SIMD128
#define CV_SIMD128 0
#endif
#ifndef CV_SIMD128_64F
#define CV_SIMD128_64F 0
#endif
#ifndef CV_SIMD256
#define CV_SIMD256 0
#endif
#ifndef CV_SIMD256_64F
#define CV_SIMD256_64F 0
#endif
#ifndef CV_SIMD512
#define CV_SIMD512 0
#endif
#ifndef CV_SIMD512_64F
#define CV_SIMD512_64F 0
#endif
#ifndef CV_SIMD128_FP16
#define CV_SIMD128_FP16 0
#endif
#ifndef CV_SIMD256_FP16
#define CV_SIMD256_FP16 0
#endif
#ifndef CV_SIMD512_FP16
#define CV_SIMD512_FP16 0
#endif
//==================================================================================================
#define CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \
inline vtyp vx_setall_##short_typ(typ v) { return prefix##_setall_##short_typ(v); } \
inline vtyp vx_setzero_##short_typ() { return prefix##_setzero_##short_typ(); } \
inline vtyp vx_##loadsfx(const typ* ptr) { return prefix##_##loadsfx(ptr); } \
inline vtyp vx_##loadsfx##_aligned(const typ* ptr) { return prefix##_##loadsfx##_aligned(ptr); } \
inline vtyp vx_##loadsfx##_low(const typ* ptr) { return prefix##_##loadsfx##_low(ptr); } \
inline vtyp vx_##loadsfx##_halves(const typ* ptr0, const typ* ptr1) { return prefix##_##loadsfx##_halves(ptr0, ptr1); } \
inline void vx_store(typ* ptr, const vtyp& v) { return v_store(ptr, v); } \
inline void vx_store_aligned(typ* ptr, const vtyp& v) { return v_store_aligned(ptr, v); } \
inline vtyp vx_lut(const typ* ptr, const int* idx) { return prefix##_lut(ptr, idx); } \
inline vtyp vx_lut_pairs(const typ* ptr, const int* idx) { return prefix##_lut_pairs(ptr, idx); }
#define CV_INTRIN_DEFINE_WIDE_LUT_QUAD(typ, vtyp, prefix) \
inline vtyp vx_lut_quads(const typ* ptr, const int* idx) { return prefix##_lut_quads(ptr, idx); }
#define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \
inline wtyp vx_load_expand(const typ* ptr) { return prefix##_load_expand(ptr); }
#define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND_Q(typ, qtyp, prefix) \
inline qtyp vx_load_expand_q(const typ* ptr) { return prefix##_load_expand_q(ptr); }
#define CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(typ, vtyp, short_typ, wtyp, qtyp, prefix, loadsfx) \
CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \
CV_INTRIN_DEFINE_WIDE_LUT_QUAD(typ, vtyp, prefix) \
CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \
CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND_Q(typ, qtyp, prefix)
#define CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(prefix) \
CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(uchar, v_uint8, u8, v_uint16, v_uint32, prefix, load) \
CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(schar, v_int8, s8, v_int16, v_int32, prefix, load) \
CV_INTRIN_DEFINE_WIDE_INTRIN(ushort, v_uint16, u16, prefix, load) \
CV_INTRIN_DEFINE_WIDE_LUT_QUAD(ushort, v_uint16, prefix) \
CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(ushort, v_uint32, prefix) \
CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_int16, s16, prefix, load) \
CV_INTRIN_DEFINE_WIDE_LUT_QUAD(short, v_int16, prefix) \
CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(short, v_int32, prefix) \
CV_INTRIN_DEFINE_WIDE_INTRIN(int, v_int32, s32, prefix, load) \
CV_INTRIN_DEFINE_WIDE_LUT_QUAD(int, v_int32, prefix) \
CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(int, v_int64, prefix) \
CV_INTRIN_DEFINE_WIDE_INTRIN(unsigned, v_uint32, u32, prefix, load) \
CV_INTRIN_DEFINE_WIDE_LUT_QUAD(unsigned, v_uint32, prefix) \
CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(unsigned, v_uint64, prefix) \
CV_INTRIN_DEFINE_WIDE_INTRIN(float, v_float32, f32, prefix, load) \
CV_INTRIN_DEFINE_WIDE_LUT_QUAD(float, v_float32, prefix) \
CV_INTRIN_DEFINE_WIDE_INTRIN(int64, v_int64, s64, prefix, load) \
CV_INTRIN_DEFINE_WIDE_INTRIN(uint64, v_uint64, u64, prefix, load) \
CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(float16_t, v_float32, prefix)
template<typename _Tp> struct V_RegTraits
{
};
#define CV_DEF_REG_TRAITS(prefix, _reg, lane_type, suffix, _u_reg, _w_reg, _q_reg, _int_reg, _round_reg) \
template<> struct V_RegTraits<_reg> \
{ \
typedef _reg reg; \
typedef _u_reg u_reg; \
typedef _w_reg w_reg; \
typedef _q_reg q_reg; \
typedef _int_reg int_reg; \
typedef _round_reg round_reg; \
}
#if CV_SIMD128 || CV_SIMD128_CPP
CV_DEF_REG_TRAITS(v, v_uint8x16, uchar, u8, v_uint8x16, v_uint16x8, v_uint32x4, v_int8x16, void);
CV_DEF_REG_TRAITS(v, v_int8x16, schar, s8, v_uint8x16, v_int16x8, v_int32x4, v_int8x16, void);
CV_DEF_REG_TRAITS(v, v_uint16x8, ushort, u16, v_uint16x8, v_uint32x4, v_uint64x2, v_int16x8, void);
CV_DEF_REG_TRAITS(v, v_int16x8, short, s16, v_uint16x8, v_int32x4, v_int64x2, v_int16x8, void);
CV_DEF_REG_TRAITS(v, v_uint32x4, unsigned, u32, v_uint32x4, v_uint64x2, void, v_int32x4, void);
CV_DEF_REG_TRAITS(v, v_int32x4, int, s32, v_uint32x4, v_int64x2, void, v_int32x4, void);
#if CV_SIMD128_64F
CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, v_float64x2, void, v_int32x4, v_int32x4);
#else
CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, void, void, v_int32x4, v_int32x4);
#endif
CV_DEF_REG_TRAITS(v, v_uint64x2, uint64, u64, v_uint64x2, void, void, v_int64x2, void);
CV_DEF_REG_TRAITS(v, v_int64x2, int64, s64, v_uint64x2, void, void, v_int64x2, void);
#if CV_SIMD128_64F
CV_DEF_REG_TRAITS(v, v_float64x2, double, f64, v_float64x2, void, void, v_int64x2, v_int32x4);
#endif
#endif
#if CV_SIMD256
CV_DEF_REG_TRAITS(v256, v_uint8x32, uchar, u8, v_uint8x32, v_uint16x16, v_uint32x8, v_int8x32, void);
CV_DEF_REG_TRAITS(v256, v_int8x32, schar, s8, v_uint8x32, v_int16x16, v_int32x8, v_int8x32, void);
CV_DEF_REG_TRAITS(v256, v_uint16x16, ushort, u16, v_uint16x16, v_uint32x8, v_uint64x4, v_int16x16, void);
CV_DEF_REG_TRAITS(v256, v_int16x16, short, s16, v_uint16x16, v_int32x8, v_int64x4, v_int16x16, void);
CV_DEF_REG_TRAITS(v256, v_uint32x8, unsigned, u32, v_uint32x8, v_uint64x4, void, v_int32x8, void);
CV_DEF_REG_TRAITS(v256, v_int32x8, int, s32, v_uint32x8, v_int64x4, void, v_int32x8, void);
CV_DEF_REG_TRAITS(v256, v_float32x8, float, f32, v_float32x8, v_float64x4, void, v_int32x8, v_int32x8);
CV_DEF_REG_TRAITS(v256, v_uint64x4, uint64, u64, v_uint64x4, void, void, v_int64x4, void);
CV_DEF_REG_TRAITS(v256, v_int64x4, int64, s64, v_uint64x4, void, void, v_int64x4, void);
CV_DEF_REG_TRAITS(v256, v_float64x4, double, f64, v_float64x4, void, void, v_int64x4, v_int32x8);
#endif
#if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512)
#define CV__SIMD_NAMESPACE simd512
namespace CV__SIMD_NAMESPACE {
#define CV_SIMD 1
#define CV_SIMD_64F CV_SIMD512_64F
#define CV_SIMD_WIDTH 64
// TODO typedef v_uint8 / v_int32 / etc types here
} // namespace
using namespace CV__SIMD_NAMESPACE;
#elif CV_SIMD256 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 256)
#define CV__SIMD_NAMESPACE simd256
namespace CV__SIMD_NAMESPACE {
#define CV_SIMD 1
#define CV_SIMD_64F CV_SIMD256_64F
#define CV_SIMD_FP16 CV_SIMD256_FP16
#define CV_SIMD_WIDTH 32
typedef v_uint8x32 v_uint8;
typedef v_int8x32 v_int8;
typedef v_uint16x16 v_uint16;
typedef v_int16x16 v_int16;
typedef v_uint32x8 v_uint32;
typedef v_int32x8 v_int32;
typedef v_uint64x4 v_uint64;
typedef v_int64x4 v_int64;
typedef v_float32x8 v_float32;
CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v256)
#if CV_SIMD256_64F
typedef v_float64x4 v_float64;
CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v256, load)
#endif
inline void vx_cleanup() { v256_cleanup(); }
} // namespace
using namespace CV__SIMD_NAMESPACE;
#elif (CV_SIMD128 || CV_SIMD128_CPP) && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 128)
#define CV__SIMD_NAMESPACE simd128
namespace CV__SIMD_NAMESPACE {
#define CV_SIMD CV_SIMD128
#define CV_SIMD_64F CV_SIMD128_64F
#define CV_SIMD_WIDTH 16
typedef v_uint8x16 v_uint8;
typedef v_int8x16 v_int8;
typedef v_uint16x8 v_uint16;
typedef v_int16x8 v_int16;
typedef v_uint32x4 v_uint32;
typedef v_int32x4 v_int32;
typedef v_uint64x2 v_uint64;
typedef v_int64x2 v_int64;
typedef v_float32x4 v_float32;
CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v)
#if CV_SIMD128_64F
typedef v_float64x2 v_float64;
CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v, load)
#endif
inline void vx_cleanup() { v_cleanup(); }
} // namespace
using namespace CV__SIMD_NAMESPACE;
#endif
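#if CV_SIMD
// Width-agnostic usage sketch (ours; assumes len is a multiple of
// v_float32::nlanes): the vx_ aliases and v_float32 typedef chosen above let
// the same loop compile to 128- or 256-bit code depending on the build.
static inline float vx_sum_sketch(const float* src, int len)
{
    v_float32 acc = vx_setzero_f32();
    for (int i = 0; i < len; i += v_float32::nlanes)
        acc = acc + vx_load(src + i); // vx_load resolves to v_load or v256_load
    return v_reduce_sum(acc);
}
#endif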
inline unsigned int trailingZeros32(unsigned int value) {
#if defined(_MSC_VER)
#if (_MSC_VER < 1700) || defined(_M_ARM)
unsigned long index = 0;
_BitScanForward(&index, value);
return (unsigned int)index;
#elif defined(__clang__)
// clang-cl doesn't export _tzcnt_u32 for non-BMI targets
return value ? __builtin_ctz(value) : 32;
#else
return _tzcnt_u32(value);
#endif
#elif defined(__GNUC__) || defined(__GNUG__)
return __builtin_ctz(value);
#elif defined(__ICC) || defined(__INTEL_COMPILER)
return _bit_scan_forward(value);
#elif defined(__clang__)
// unreachable in practice (clang also defines __GNUC__ and is handled above),
// and the original "llvm.cttz.i32(value, true)" is not valid C++; use the builtin
return __builtin_ctz(value);
#else
static const int MultiplyDeBruijnBitPosition[32] = {
0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 };
return MultiplyDeBruijnBitPosition[((uint32_t)((value & -value) * 0x077CB531U)) >> 27];
#endif
}
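// Worked example for the De Bruijn fallback above (our annotation): for
// value = 20 (0b10100), value & -value isolates the lowest set bit, 4 (0b100);
// (4 * 0x077CB531U) >> 27 yields table index 3, whose entry is 2, and indeed
// trailingZeros32(20) == 2 on every branch of this function.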
#ifndef CV_DOXYGEN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
#endif
#ifndef CV_SIMD_64F
#define CV_SIMD_64F 0
#endif
#ifndef CV_SIMD_FP16
#define CV_SIMD_FP16 0 //!< Defined to 1 when operations on float16x8_t / float16x16_t (SIMD256) types are natively supported
#endif
#ifndef CV_SIMD
#define CV_SIMD 0
#endif
} // cv::
//! @endcond
#endif

File diff suppressed because it is too large

View File

@@ -108,8 +108,8 @@ block and to save contents of the register to memory block.
These operations allow reordering or recombining elements in one or multiple vectors.
- Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
- Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand
- Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u,
- Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand, @ref v_expand_low, @ref v_expand_high
- Pack: @ref v_pack, @ref v_pack_u, @ref v_pack_b, @ref v_rshr_pack, @ref v_rshr_pack_u,
@ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
- Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high
- Extract: @ref v_extract
@@ -159,7 +159,7 @@ Most of these operations return only one value.
### Other math
- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude
- Absolute values: @ref v_abs, @ref v_absdiff
- Absolute values: @ref v_abs, @ref v_absdiff, @ref v_absdiffs
### Conversions
@@ -185,23 +185,29 @@ Regular integers:
|load, store | x | x | x | x | x | x |
|interleave | x | x | x | x | x | x |
|expand | x | x | x | x | x | x |
|expand_low | x | x | x | x | x | x |
|expand_high | x | x | x | x | x | x |
|expand_q | x | x | | | | |
|add, sub | x | x | x | x | x | x |
|add_wrap, sub_wrap | x | x | x | x | | |
|mul | | | x | x | x | x |
|mul_expand | | | x | x | x | |
|mul_wrap | x | x | x | x | | |
|mul | x | x | x | x | x | x |
|mul_expand | x | x | x | x | x | |
|compare | x | x | x | x | x | x |
|shift | | | x | x | x | x |
|dotprod | | | | x | | |
|logical | x | x | x | x | x | x |
|min, max | x | x | x | x | x | x |
|absdiff | x | x | x | x | x | x |
|absdiffs | | x | | x | | |
|reduce | | | | | x | x |
|mask | x | x | x | x | x | x |
|pack | x | x | x | x | x | x |
|pack_u | x | | x | | | |
|pack_b | x | | | | | |
|unpack | x | x | x | x | x | x |
|extract | x | x | x | x | x | x |
|rotate (lanes) | x | x | x | x | x | x |
|cvt_flt32 | | | | | | x |
|cvt_flt64 | | | | | | x |
|transpose4x4 | | | | | x | x |
@@ -215,6 +221,7 @@ Big integers:
|shift | x | x |
|logical | x | x |
|extract | x | x |
|rotate (lanes) | x | x |
Floating point:
@@ -236,7 +243,8 @@ Floating point:
|sqrt, abs | x | x |
|float math | x | x |
|transpose4x4 | x | |
|extract | x | x |
|rotate (lanes) | x | x |
@{ */
@@ -244,8 +252,6 @@ template<typename _Tp, int n> struct v_reg
{
//! @cond IGNORED
typedef _Tp lane_type;
typedef v_reg<typename V_TypeTraits<_Tp>::int_type, n> int_vec;
typedef v_reg<typename V_TypeTraits<_Tp>::abs_type, n> abs_vec;
enum { nlanes = n };
//! @endcond
@@ -677,9 +683,28 @@ OPENCV_HAL_IMPL_CMP_OP(==)
For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(!=)
template<int n>
inline v_reg<float, n> v_not_nan(const v_reg<float, n>& a)
{
typedef typename V_TypeTraits<float>::int_type itype;
v_reg<float, n> c;
for (int i = 0; i < n; i++)
c.s[i] = V_TypeTraits<float>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i]));
return c;
}
template<int n>
inline v_reg<double, n> v_not_nan(const v_reg<double, n>& a)
{
typedef typename V_TypeTraits<double>::int_type itype;
v_reg<double, n> c;
for (int i = 0; i < n; i++)
c.s[i] = V_TypeTraits<double>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i]));
return c;
}
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_ADD_SUB_OP(func, bin_op, cast_op, _Tp2) \
#define OPENCV_HAL_IMPL_ARITHM_OP(func, bin_op, cast_op, _Tp2) \
template<typename _Tp, int n> \
inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
@@ -693,12 +718,17 @@ inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
/** @brief Add values without saturation
For 8- and 16-bit integer values. */
OPENCV_HAL_IMPL_ADD_SUB_OP(v_add_wrap, +, (_Tp), _Tp)
OPENCV_HAL_IMPL_ARITHM_OP(v_add_wrap, +, (_Tp), _Tp)
/** @brief Subtract values without saturation
For 8- and 16-bit integer values. */
OPENCV_HAL_IMPL_ADD_SUB_OP(v_sub_wrap, -, (_Tp), _Tp)
OPENCV_HAL_IMPL_ARITHM_OP(v_sub_wrap, -, (_Tp), _Tp)
/** @brief Multiply values without saturation
For 8- and 16-bit integer values. */
OPENCV_HAL_IMPL_ARITHM_OP(v_mul_wrap, *, (_Tp), _Tp)
//! @cond IGNORED
template<typename T> inline T _absdiff(T a, T b)
@@ -753,6 +783,19 @@ inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
return c;
}
/** @brief Saturating absolute difference
Returns \f$ saturate(|a - b|) \f$ .
For 8-, 16-bit signed integer source types. */
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_absdiffs(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
v_reg<_Tp, n> c;
for( int i = 0; i < n; i++)
c.s[i] = saturate_cast<_Tp>(std::abs(a.s[i] - b.s[i]));
return c;
}
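// Worked example (our annotation): for int8 lanes a = 100, b = -100 the exact
// difference 200 does not fit in schar, so v_absdiffs saturates it to 127;
// the unsigned v_absdiff above avoids this by returning the unsigned abs_type.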
/** @brief Inverse square root
Returns \f$ 1/sqrt(a) \f$
@@ -794,11 +837,11 @@ inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>
/** @brief Multiply and add
Returns \f$ a*b + c \f$
For floating point types only. */
Returns \f$ a*b + c \f$
For floating point types and signed 32-bit int only. */
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
const v_reg<_Tp, n>& c)
inline v_reg<_Tp, n> v_fma(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
const v_reg<_Tp, n>& c)
{
v_reg<_Tp, n> d;
for( int i = 0; i < n; i++ )
@@ -806,6 +849,14 @@ inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
return d;
}
/** @brief A synonym for v_fma */
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
const v_reg<_Tp, n>& c)
{
return v_fma(a, b, c);
}
/** @brief Dot product of elements
Multiply values in two registers and sum adjacent result pairs.
@@ -828,6 +879,29 @@ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n
return c;
}
/** @brief Dot product of elements
Same as cv::v_dotprod, but adds a third element to the sum of adjacent pairs.
Scheme:
@code
{A1 A2 ...} // 16-bit
x {B1 B2 ...} // 16-bit
-------------
{A1B1+A2B2+C1 ...} // 32-bit
@endcode
Implemented only for 16-bit signed source type (v_int16x8).
*/
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
{
typedef typename V_TypeTraits<_Tp>::w_type w_type;
v_reg<w_type, n/2> s;
for( int i = 0; i < (n/2); i++ )
s.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1] + c.s[i];
return s;
}
/** @brief Multiply and expand
Multiply values of two registers and store the results in two registers with a wider pack type.
@@ -859,6 +933,20 @@ template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, c
}
}
/** @brief Multiply and extract high part
Multiply values of two registers and store the high part of the results.
Implemented only for 16-bit source types (v_int16x8, v_uint16x8). Returns \f$ a*b >> 16 \f$
*/
template<typename _Tp, int n> inline v_reg<_Tp, n> v_mul_hi(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
typedef typename V_TypeTraits<_Tp>::w_type w_type;
v_reg<_Tp, n> c;
for (int i = 0; i < n; i++)
c.s[i] = (_Tp)(((w_type)a.s[i] * b.s[i]) >> sizeof(_Tp)*8);
return c;
}
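// Worked example (our annotation): for ushort lanes a = 0x8000, b = 4 the full
// product is 0x20000, so v_mul_hi returns its upper 16 bits, 0x0002 == (a*b) >> 16.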
//! @cond IGNORED
template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c)
@@ -975,6 +1063,21 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
return r;
}
/** @brief Sum absolute differences of values
Scheme:
@code
{A1 A2 A3 ...} {B1 B2 B3 ...} => sum{abs(A1-B1),abs(A2-B2),abs(A3-B3),...}
@endcode
For all types except 64-bit types.*/
template<typename _Tp, int n> inline typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type v_reduce_sad(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type c = _absdiff(a.s[0], b.s[0]);
for (int i = 1; i < n; i++)
c += _absdiff(a.s[i], b.s[i]);
return c;
}
/** @brief Get negative values mask
Returned value is a bit mask with bits set to 1 on places corresponding to negative packed values indexes.
@@ -1016,13 +1119,16 @@ template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a)
return false;
}
/** @brief Bitwise select
/** @brief Per-element select (blend operation)
Return value will be built by combining values a and b using the following scheme:
If the i-th bit in _mask_ is 1
select i-th bit from _a_
else
select i-th bit from _b_ */
Return value will be built by combining values _a_ and _b_ using the following scheme:
result[i] = mask[i] ? a[i] : b[i];
@note: _mask_ element values are restricted to the following:
- 0: select element from _b_
- 0xff/0xffff/etc: select element from _a_
(fully compatible with bitwise-based operator)
*/
template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask,
const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
@@ -1032,8 +1138,8 @@ template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>&
for( int i = 0; i < n; i++ )
{
int_type m = Traits::reinterpret_int(mask.s[i]);
c.s[i] = Traits::reinterpret_from_int((Traits::reinterpret_int(a.s[i]) & m)
| (Traits::reinterpret_int(b.s[i]) & ~m));
CV_DbgAssert(m == 0 || m == (~(int_type)0)); // restrict mask values: 0 or 0xff/0xffff/etc
c.s[i] = m ? a.s[i] : b.s[i];
}
return c;
}
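// Usage sketch (ours): masks normally come from comparisons, which already
// produce the all-zeros / all-ones lanes that v_select requires, e.g. a ReLU:
inline v_float32x4 relu_select_sketch(const v_float32x4& x)
{
    v_float32x4 zero(0.f, 0.f, 0.f, 0.f);
    return v_select(x > zero, x, zero); // per lane: (x > 0) ? x : 0
}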
@@ -1057,6 +1163,44 @@ template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
}
}
/** @brief Expand lower values to the wider pack type
Same as cv::v_expand, but returns only the lower half of the vector.
Scheme:
@code
int32x4 int64x2
{A B C D} ==> {A B}
@endcode */
template<typename _Tp, int n>
inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_expand_low(const v_reg<_Tp, n>& a)
{
v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> b;
for( int i = 0; i < (n/2); i++ )
b.s[i] = a.s[i];
return b;
}
/** @brief Expand higher values to the wider pack type
Same as cv::v_expand_low, but expands the higher half of the vector instead.
Scheme:
@code
int32x4 int64x2
{A B C D} ==> {C D}
@endcode */
template<typename _Tp, int n>
inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_expand_high(const v_reg<_Tp, n>& a)
{
v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> b;
for( int i = 0; i < (n/2); i++ )
b.s[i] = a.s[i+(n/2)];
return b;
}
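// Usage sketch (ours, assuming the pack and shift operators defined later in
// this header are in scope): widen 8-bit lanes before arithmetic that could
// overflow, then pack the halves back; a truncating average of two uint8 vectors:
inline v_uint8x16 avg_u8_sketch(const v_uint8x16& a, const v_uint8x16& b)
{
    v_uint16x8 lo = (v_expand_low(a)  + v_expand_low(b))  >> 1;
    v_uint16x8 hi = (v_expand_high(a) + v_expand_high(b)) >> 1;
    return v_pack(lo, hi); // saturating pack back to uint8 lanes
}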
//! @cond IGNORED
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n>
v_reinterpret_as_int(const v_reg<_Tp, n>& a)
@@ -1112,9 +1256,9 @@ template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const
@note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x16, int ==> cv::v_int32x4, etc.
*/
template<typename _Tp>
inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load(const _Tp* ptr)
inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load(const _Tp* ptr)
{
return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr);
return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr);
}
/** @brief Load register contents from memory (aligned)
@@ -1122,9 +1266,9 @@ inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load(const _Tp* ptr)
similar to cv::v_load, but source memory block should be aligned (to 16-byte boundary)
*/
template<typename _Tp>
inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_aligned(const _Tp* ptr)
inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_aligned(const _Tp* ptr)
{
return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr);
return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr);
}
/** @brief Load 64-bits of data to lower part (high part is undefined).
@@ -1137,9 +1281,9 @@ v_int32x4 r = v_load_low(lo);
@endcode
*/
template<typename _Tp>
inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_low(const _Tp* ptr)
inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_low(const _Tp* ptr)
{
v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> c;
v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
for( int i = 0; i < c.nlanes/2; i++ )
{
c.s[i] = ptr[i];
@@ -1158,9 +1302,9 @@ v_int32x4 r = v_load_halves(lo, hi);
@endcode
*/
template<typename _Tp>
inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
{
v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> c;
v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
for( int i = 0; i < c.nlanes/2; i++ )
{
c.s[i] = loptr[i];
@@ -1179,11 +1323,11 @@ v_int32x4 r = v_load_expand(buf); // r = {1, 2, 3, 4} - type is int32
@endcode
For 8-, 16-, 32-bit integer source types. */
template<typename _Tp>
inline v_reg<typename V_TypeTraits<_Tp>::w_type, V_SIMD128Traits<_Tp>::nlanes / 2>
inline v_reg<typename V_TypeTraits<_Tp>::w_type, V_TypeTraits<_Tp>::nlanes128 / 2>
v_load_expand(const _Tp* ptr)
{
typedef typename V_TypeTraits<_Tp>::w_type w_type;
v_reg<w_type, V_SIMD128Traits<w_type>::nlanes> c;
v_reg<w_type, V_TypeTraits<w_type>::nlanes128> c;
for( int i = 0; i < c.nlanes; i++ )
{
c.s[i] = ptr[i];
@@ -1200,11 +1344,11 @@ v_int32x4 r = v_load_q(buf); // r = {1, 2, 3, 4} - type is int32
@endcode
For 8-bit integer source types. */
template<typename _Tp>
inline v_reg<typename V_TypeTraits<_Tp>::q_type, V_SIMD128Traits<_Tp>::nlanes / 4>
inline v_reg<typename V_TypeTraits<_Tp>::q_type, V_TypeTraits<_Tp>::nlanes128 / 4>
v_load_expand_q(const _Tp* ptr)
{
typedef typename V_TypeTraits<_Tp>::q_type q_type;
v_reg<q_type, V_SIMD128Traits<q_type>::nlanes> c;
v_reg<q_type, V_TypeTraits<q_type>::nlanes128> c;
for( int i = 0; i < c.nlanes; i++ )
{
c.s[i] = ptr[i];
@@ -1284,7 +1428,8 @@ Scheme:
For all types except 64-bit. */
template<typename _Tp, int n>
inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
const v_reg<_Tp, n>& b)
const v_reg<_Tp, n>& b,
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
{
int i, i2;
for( i = i2 = 0; i < n; i++, i2 += 2 )
@@ -1304,7 +1449,8 @@ Scheme:
For all types except 64-bit. */
template<typename _Tp, int n>
inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c)
const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
{
int i, i3;
for( i = i3 = 0; i < n; i++, i3 += 3 )
@@ -1325,7 +1471,8 @@ Scheme:
For all types except 64-bit. */
template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
const v_reg<_Tp, n>& d)
const v_reg<_Tp, n>& d,
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
{
int i, i4;
for( i = i4 = 0; i < n; i++, i4 += 4 )
@@ -1395,6 +1542,20 @@ inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
ptr[i] = a.s[i];
}
template<typename _Tp, int n>
inline void v_store_aligned_nocache(_Tp* ptr, const v_reg<_Tp, n>& a)
{
for( int i = 0; i < n; i++ )
ptr[i] = a.s[i];
}
template<typename _Tp, int n>
inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/)
{
for( int i = 0; i < n; i++ )
ptr[i] = a.s[i];
}
/** @brief Combine vector from first elements of two vectors
Scheme:
@@ -1476,7 +1637,7 @@ Usage:
v_int32x4 a, b, c;
c = v_extract<2>(a, b);
@endcode
For integer types only. */
For all types. */
template<int s, typename _Tp, int n>
inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
@@ -1501,6 +1662,18 @@ template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
return c;
}
/** @overload */
template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a, const v_reg<double, n>& b)
{
v_reg<int, n*2> c;
for( int i = 0; i < n; i++ )
{
c.s[i] = cvRound(a.s[i]);
c.s[i+n] = cvRound(b.s[i]);
}
return c;
}
/** @brief Floor
Floor each value. Input type is float vector ==> output type is int vector.*/
@@ -1593,6 +1766,17 @@ template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a)
return c;
}
template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a, const v_reg<double, n>& b)
{
v_reg<float, n*2> c;
for( int i = 0; i < n; i++ )
{
c.s[i] = (float)a.s[i];
c.s[i+n] = (float)b.s[i];
}
return c;
}
/** @brief Convert to double
Supported input type is cv::v_int32x4. */
@@ -1615,6 +1799,124 @@ template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<float, n*2>& a)
return c;
}
template<typename _Tp> inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut(const _Tp* tab, const int* idx)
{
v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++)
c.s[i] = tab[idx[i]];
return c;
}
template<typename _Tp> inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut_pairs(const _Tp* tab, const int* idx)
{
v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++)
c.s[i] = tab[idx[i / 2] + i % 2];
return c;
}
template<typename _Tp> inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut_quads(const _Tp* tab, const int* idx)
{
v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++)
c.s[i] = tab[idx[i / 4] + i % 4];
return c;
}
template<int n> inline v_reg<int, n> v_lut(const int* tab, const v_reg<int, n>& idx)
{
v_reg<int, n> c;
for( int i = 0; i < n; i++ )
c.s[i] = tab[idx.s[i]];
return c;
}
template<int n> inline v_reg<unsigned, n> v_lut(const unsigned* tab, const v_reg<int, n>& idx)
{
v_reg<int, n> c;
for (int i = 0; i < n; i++)
c.s[i] = tab[idx.s[i]];
return c;
}
template<int n> inline v_reg<float, n> v_lut(const float* tab, const v_reg<int, n>& idx)
{
v_reg<float, n> c;
for( int i = 0; i < n; i++ )
c.s[i] = tab[idx.s[i]];
return c;
}
template<int n> inline v_reg<double, n> v_lut(const double* tab, const v_reg<int, n*2>& idx)
{
v_reg<double, n> c;
for( int i = 0; i < n; i++ )
c.s[i] = tab[idx.s[i]];
return c;
}
template<int n> inline void v_lut_deinterleave(const float* tab, const v_reg<int, n>& idx,
v_reg<float, n>& x, v_reg<float, n>& y)
{
for( int i = 0; i < n; i++ )
{
int j = idx.s[i];
x.s[i] = tab[j];
y.s[i] = tab[j+1];
}
}
template<int n> inline void v_lut_deinterleave(const double* tab, const v_reg<int, n*2>& idx,
v_reg<double, n>& x, v_reg<double, n>& y)
{
for( int i = 0; i < n; i++ )
{
int j = idx.s[i];
x.s[i] = tab[j];
y.s[i] = tab[j+1];
}
}
template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_pairs(const v_reg<_Tp, n>& vec)
{
v_reg<float, n> c;
for (int i = 0; i < n/4; i++)
{
c.s[4*i ] = vec.s[4*i ];
c.s[4*i+1] = vec.s[4*i+2];
c.s[4*i+2] = vec.s[4*i+1];
c.s[4*i+3] = vec.s[4*i+3];
}
return c;
}
template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_quads(const v_reg<_Tp, n>& vec)
{
v_reg<float, n> c;
for (int i = 0; i < n/8; i++)
{
c.s[8*i ] = vec.s[8*i ];
c.s[8*i+1] = vec.s[8*i+4];
c.s[8*i+2] = vec.s[8*i+1];
c.s[8*i+3] = vec.s[8*i+5];
c.s[8*i+4] = vec.s[8*i+2];
c.s[8*i+5] = vec.s[8*i+6];
c.s[8*i+6] = vec.s[8*i+3];
c.s[8*i+7] = vec.s[8*i+7];
}
return c;
}
template<typename _Tp, int n> inline v_reg<_Tp, n> v_pack_triplets(const v_reg<_Tp, n>& vec)
{
v_reg<float, n> c;
for (int i = 0; i < n/4; i++)
{
c.s[3*i ] = vec.s[4*i ];
c.s[3*i+1] = vec.s[4*i+1];
c.s[3*i+2] = vec.s[4*i+2];
}
return c;
}
/** @brief Transpose 4x4 matrix
Scheme:
@@ -1890,6 +2192,103 @@ OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, s
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast)
//! @}
//! @cond IGNORED
template<typename _Tpm, typename _Tp, int n>
inline void _pack_b(_Tpm* mptr, const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
for (int i = 0; i < n; ++i)
{
mptr[i] = (_Tpm)a.s[i];
mptr[i + n] = (_Tpm)b.s[i];
}
}
//! @endcond
//! @name Pack boolean values
//! @{
//! @brief Pack boolean values from multiple vectors to one unsigned 8-bit integer vector
//!
//! @note Must provide valid boolean values to guarantee the same result on all architectures.
/** @brief For 16-bit boolean values
Scheme:
@code
a {0xFFFF 0 0 0xFFFF 0 0xFFFF 0xFFFF 0}
b {0xFFFF 0 0xFFFF 0 0 0xFFFF 0 0xFFFF}
===============
{
0xFF 0 0 0xFF 0 0xFF 0xFF 0
0xFF 0 0xFF 0 0 0xFF 0 0xFF
}
@endcode */
inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
{
v_uint8x16 mask;
_pack_b(mask.s, a, b);
return mask;
}
/** @overload
For 32-bit boolean values
Scheme:
@code
a {0xFFFF.. 0 0 0xFFFF..}
b {0 0xFFFF.. 0xFFFF.. 0}
c {0xFFFF.. 0 0xFFFF.. 0}
d {0 0xFFFF.. 0 0xFFFF..}
===============
{
0xFF 0 0 0xFF 0 0xFF 0xFF 0
0xFF 0 0xFF 0 0 0xFF 0 0xFF
}
@endcode */
inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
const v_uint32x4& c, const v_uint32x4& d)
{
v_uint8x16 mask;
_pack_b(mask.s, a, b);
_pack_b(mask.s + 8, c, d);
return mask;
}
/** @overload
For 64-bit boolean values
Scheme:
@code
a {0xFFFF.. 0}
b {0 0xFFFF..}
c {0xFFFF.. 0}
d {0 0xFFFF..}
e {0xFFFF.. 0}
f {0xFFFF.. 0}
g {0 0xFFFF..}
h {0 0xFFFF..}
===============
{
0xFF 0 0 0xFF 0xFF 0 0 0xFF
0xFF 0 0xFF 0 0 0xFF 0 0xFF
}
@endcode */
inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
const v_uint64x2& g, const v_uint64x2& h)
{
v_uint8x16 mask;
_pack_b(mask.s, a, b);
_pack_b(mask.s + 4, c, d);
_pack_b(mask.s + 8, e, f);
_pack_b(mask.s + 12, g, h);
return mask;
}
//! @}
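// Usage sketch (ours): v_pack_b collapses wider comparison masks into one byte
// mask, e.g. to drive an 8-bit v_select from four 32-bit threshold tests:
inline v_uint8x16 pack_cmp_sketch(const v_int32x4& a, const v_int32x4& b,
                                  const v_int32x4& c, const v_int32x4& d,
                                  const v_int32x4& thr)
{
    return v_pack_b(v_reinterpret_as_u32(a > thr), v_reinterpret_as_u32(b > thr),
                    v_reinterpret_as_u32(c > thr), v_reinterpret_as_u32(d > thr));
}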
/** @brief Matrix multiplication
Scheme:
@@ -1939,6 +2338,30 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + m3.s[3]);
}
////// FP16 support ///////
inline v_reg<float, V_TypeTraits<float>::nlanes128>
v_load_expand(const float16_t* ptr)
{
v_reg<float, V_TypeTraits<float>::nlanes128> v;
for( int i = 0; i < v.nlanes; i++ )
{
v.s[i] = ptr[i];
}
return v;
}
inline void
v_pack_store(float16_t* ptr, v_reg<float, V_TypeTraits<float>::nlanes128>& v)
{
for( int i = 0; i < v.nlanes; i++ )
{
ptr[i] = float16_t(v.s[i]);
}
}
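// Round-trip sketch (ours; buffer names are hypothetical, len is assumed to be
// a multiple of the float lane count): narrow float data to float16 storage and
// widen it back using the two helpers above.
inline void f16_roundtrip_sketch(const float* src, float16_t* half, float* dst, int len)
{
    for (int i = 0; i < len; i += V_TypeTraits<float>::nlanes128)
    {
        v_reg<float, V_TypeTraits<float>::nlanes128> v = v_load(src + i);
        v_pack_store(half + i, v);                 // float32 -> float16
        v_store(dst + i, v_load_expand(half + i)); // float16 -> float32
    }
}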
inline void v_cleanup() {}
//! @}
//! @name Check SIMD support

View File

@@ -0,0 +1,158 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#ifndef CV__SIMD_FORWARD
#error "Need to pre-define forward width"
#endif
namespace cv
{
//! @cond IGNORED
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
/** Types **/
#if CV__SIMD_FORWARD == 512
// [todo] 512
#error "AVX512 Not implemented yet"
#elif CV__SIMD_FORWARD == 256
// 256
#define __CV_VX(fun) v256_##fun
#define __CV_V_UINT8 v_uint8x32
#define __CV_V_INT8 v_int8x32
#define __CV_V_UINT16 v_uint16x16
#define __CV_V_INT16 v_int16x16
#define __CV_V_UINT32 v_uint32x8
#define __CV_V_INT32 v_int32x8
#define __CV_V_UINT64 v_uint64x4
#define __CV_V_INT64 v_int64x4
#define __CV_V_FLOAT32 v_float32x8
#define __CV_V_FLOAT64 v_float64x4
struct v_uint8x32;
struct v_int8x32;
struct v_uint16x16;
struct v_int16x16;
struct v_uint32x8;
struct v_int32x8;
struct v_uint64x4;
struct v_int64x4;
struct v_float32x8;
struct v_float64x4;
#else
// 128
#define __CV_VX(fun) v_##fun
#define __CV_V_UINT8 v_uint8x16
#define __CV_V_INT8 v_int8x16
#define __CV_V_UINT16 v_uint16x8
#define __CV_V_INT16 v_int16x8
#define __CV_V_UINT32 v_uint32x4
#define __CV_V_INT32 v_int32x4
#define __CV_V_UINT64 v_uint64x2
#define __CV_V_INT64 v_int64x2
#define __CV_V_FLOAT32 v_float32x4
#define __CV_V_FLOAT64 v_float64x2
struct v_uint8x16;
struct v_int8x16;
struct v_uint16x8;
struct v_int16x8;
struct v_uint32x4;
struct v_int32x4;
struct v_uint64x2;
struct v_int64x2;
struct v_float32x4;
struct v_float64x2;
#endif
/** Value reordering **/
// Expansion
void v_expand(const __CV_V_UINT8&, __CV_V_UINT16&, __CV_V_UINT16&);
void v_expand(const __CV_V_INT8&, __CV_V_INT16&, __CV_V_INT16&);
void v_expand(const __CV_V_UINT16&, __CV_V_UINT32&, __CV_V_UINT32&);
void v_expand(const __CV_V_INT16&, __CV_V_INT32&, __CV_V_INT32&);
void v_expand(const __CV_V_UINT32&, __CV_V_UINT64&, __CV_V_UINT64&);
void v_expand(const __CV_V_INT32&, __CV_V_INT64&, __CV_V_INT64&);
// Low Expansion
__CV_V_UINT16 v_expand_low(const __CV_V_UINT8&);
__CV_V_INT16 v_expand_low(const __CV_V_INT8&);
__CV_V_UINT32 v_expand_low(const __CV_V_UINT16&);
__CV_V_INT32 v_expand_low(const __CV_V_INT16&);
__CV_V_UINT64 v_expand_low(const __CV_V_UINT32&);
__CV_V_INT64 v_expand_low(const __CV_V_INT32&);
// High Expansion
__CV_V_UINT16 v_expand_high(const __CV_V_UINT8&);
__CV_V_INT16 v_expand_high(const __CV_V_INT8&);
__CV_V_UINT32 v_expand_high(const __CV_V_UINT16&);
__CV_V_INT32 v_expand_high(const __CV_V_INT16&);
__CV_V_UINT64 v_expand_high(const __CV_V_UINT32&);
__CV_V_INT64 v_expand_high(const __CV_V_INT32&);
// Load & Low Expansion
__CV_V_UINT16 __CV_VX(load_expand)(const uchar*);
__CV_V_INT16 __CV_VX(load_expand)(const schar*);
__CV_V_UINT32 __CV_VX(load_expand)(const ushort*);
__CV_V_INT32 __CV_VX(load_expand)(const short*);
__CV_V_UINT64 __CV_VX(load_expand)(const uint*);
__CV_V_INT64 __CV_VX(load_expand)(const int*);
// Load lower 8-bit and expand into 32-bit
__CV_V_UINT32 __CV_VX(load_expand_q)(const uchar*);
__CV_V_INT32 __CV_VX(load_expand_q)(const schar*);
// Saturating Pack
__CV_V_UINT8 v_pack(const __CV_V_UINT16&, const __CV_V_UINT16&);
__CV_V_INT8 v_pack(const __CV_V_INT16&, const __CV_V_INT16&);
__CV_V_UINT16 v_pack(const __CV_V_UINT32&, const __CV_V_UINT32&);
__CV_V_INT16 v_pack(const __CV_V_INT32&, const __CV_V_INT32&);
// Non-saturating Pack
__CV_V_UINT32 v_pack(const __CV_V_UINT64&, const __CV_V_UINT64&);
__CV_V_INT32 v_pack(const __CV_V_INT64&, const __CV_V_INT64&);
// Pack signed integers with unsigned saturation
__CV_V_UINT8 v_pack_u(const __CV_V_INT16&, const __CV_V_INT16&);
__CV_V_UINT16 v_pack_u(const __CV_V_INT32&, const __CV_V_INT32&);
/** Arithmetic, bitwise and comparison operations **/
// Non-saturating multiply
#if CV_VSX
template<typename Tvec>
Tvec v_mul_wrap(const Tvec& a, const Tvec& b);
#else
__CV_V_UINT8 v_mul_wrap(const __CV_V_UINT8&, const __CV_V_UINT8&);
__CV_V_INT8 v_mul_wrap(const __CV_V_INT8&, const __CV_V_INT8&);
__CV_V_UINT16 v_mul_wrap(const __CV_V_UINT16&, const __CV_V_UINT16&);
__CV_V_INT16 v_mul_wrap(const __CV_V_INT16&, const __CV_V_INT16&);
#endif
// Multiply and expand
#if CV_VSX
template<typename Tvec, typename Twvec>
void v_mul_expand(const Tvec& a, const Tvec& b, Twvec& c, Twvec& d);
#else
void v_mul_expand(const __CV_V_UINT8&, const __CV_V_UINT8&, __CV_V_UINT16&, __CV_V_UINT16&);
void v_mul_expand(const __CV_V_INT8&, const __CV_V_INT8&, __CV_V_INT16&, __CV_V_INT16&);
void v_mul_expand(const __CV_V_UINT16&, const __CV_V_UINT16&, __CV_V_UINT32&, __CV_V_UINT32&);
void v_mul_expand(const __CV_V_INT16&, const __CV_V_INT16&, __CV_V_INT32&, __CV_V_INT32&);
void v_mul_expand(const __CV_V_UINT32&, const __CV_V_UINT32&, __CV_V_UINT64&, __CV_V_UINT64&);
void v_mul_expand(const __CV_V_INT32&, const __CV_V_INT32&, __CV_V_INT64&, __CV_V_INT64&);
#endif
/** Cleanup **/
#undef CV__SIMD_FORWARD
#undef __CV_VX
#undef __CV_V_UINT8
#undef __CV_V_INT8
#undef __CV_V_UINT16
#undef __CV_V_INT16
#undef __CV_V_UINT32
#undef __CV_V_INT32
#undef __CV_V_UINT64
#undef __CV_V_INT64
#undef __CV_V_FLOAT32
#undef __CV_V_FLOAT64
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond
} // cv::

View File

@@ -280,11 +280,10 @@ struct v_float64x2
#if CV_FP16
// Workaround for old compilers
template <typename T> static inline int16x4_t vreinterpret_s16_f16(T a)
{ return (int16x4_t)a; }
template <typename T> static inline float16x4_t vreinterpret_f16_s16(T a)
{ return (float16x4_t)a; }
template <typename T> static inline float16x4_t cv_vld1_f16(const T* ptr)
static inline int16x4_t vreinterpret_s16_f16(float16x4_t a) { return (int16x4_t)a; }
static inline float16x4_t vreinterpret_f16_s16(int16x4_t a) { return (float16x4_t)a; }
static inline float16x4_t cv_vld1_f16(const void* ptr)
{
#ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro
return vreinterpret_f16_s16(vld1_s16((const short*)ptr));
@@ -292,7 +291,7 @@ template <typename T> static inline float16x4_t cv_vld1_f16(const T* ptr)
return vld1_f16((const __fp16*)ptr);
#endif
}
template <typename T> static inline void cv_vst1_f16(T* ptr, float16x4_t a)
static inline void cv_vst1_f16(void* ptr, float16x4_t a)
{
#ifndef vst1_f16 // APPLE compiler defines vst1_f16 as macro
vst1_s16((short*)ptr, vreinterpret_s16_f16(a));
@@ -301,24 +300,24 @@ template <typename T> static inline void cv_vst1_f16(T* ptr, float16x4_t a)
#endif
}
struct v_float16x4
{
typedef short lane_type;
enum { nlanes = 4 };
#ifndef vdup_n_f16
#define vdup_n_f16(v) (float16x4_t){v, v, v, v}
#endif
v_float16x4() {}
explicit v_float16x4(float16x4_t v) : val(v) {}
v_float16x4(short v0, short v1, short v2, short v3)
{
short v[] = {v0, v1, v2, v3};
val = cv_vld1_f16(v);
}
short get0() const
{
return vget_lane_s16(vreinterpret_s16_f16(val), 0);
}
float16x4_t val;
};
#endif // CV_FP16
#if CV_FP16
inline v_float32x4 v128_load_fp16_f32(const short* ptr)
{
float16x4_t a = cv_vld1_f16((const __fp16*)ptr);
return v_float32x4(vcvt_f32_f16(a));
}
inline void v_store_fp16(short* ptr, const v_float32x4& a)
{
float16x4_t fp16 = vcvt_f16_f32(a.val);
cv_vst1_f16((short*)ptr, fp16);
}
#endif
#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
@@ -395,6 +394,35 @@ OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, pack, vmovn
OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, pack_u, vqmovun_s16, vqrshrun_n_s16)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, pack_u, vqmovun_s32, vqrshrun_n_s32)
// pack boolean
inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
{
uint8x16_t ab = vcombine_u8(vmovn_u16(a.val), vmovn_u16(b.val));
return v_uint8x16(ab);
}
inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
const v_uint32x4& c, const v_uint32x4& d)
{
uint16x8_t nab = vcombine_u16(vmovn_u32(a.val), vmovn_u32(b.val));
uint16x8_t ncd = vcombine_u16(vmovn_u32(c.val), vmovn_u32(d.val));
return v_uint8x16(vcombine_u8(vmovn_u16(nab), vmovn_u16(ncd)));
}
inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
const v_uint64x2& g, const v_uint64x2& h)
{
uint32x4_t ab = vcombine_u32(vmovn_u64(a.val), vmovn_u64(b.val));
uint32x4_t cd = vcombine_u32(vmovn_u64(c.val), vmovn_u64(d.val));
uint32x4_t ef = vcombine_u32(vmovn_u64(e.val), vmovn_u64(f.val));
uint32x4_t gh = vcombine_u32(vmovn_u64(g.val), vmovn_u64(h.val));
uint16x8_t abcd = vcombine_u16(vmovn_u32(ab), vmovn_u32(cd));
uint16x8_t efgh = vcombine_u16(vmovn_u32(ef), vmovn_u32(gh));
return v_uint8x16(vcombine_u8(vmovn_u16(abcd), vmovn_u16(efgh)));
}
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
const v_float32x4& m1, const v_float32x4& m2,
const v_float32x4& m3)
@@ -436,10 +464,8 @@ OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint16x8, vmulq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int16x8, vmulq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32)
@@ -477,6 +503,37 @@ inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
}
#endif
// saturating multiply 8-bit, 16-bit
#define OPENCV_HAL_IMPL_NEON_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpwvec c, d; \
v_mul_expand(a, b, c, d); \
return v_pack(c, d); \
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }
OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int8x16, v_int16x8)
OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint8x16, v_uint16x8)
OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int16x8, v_int32x4)
OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint16x8, v_uint32x4)
// Multiply and expand
inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
v_int16x8& c, v_int16x8& d)
{
c.val = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
d.val = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
}
inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
v_uint16x8& c, v_uint16x8& d)
{
c.val = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
d.val = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
}
inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
v_int32x4& c, v_int32x4& d)
{
@@ -498,6 +555,21 @@ inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val));
}
inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
{
return v_int16x8(vcombine_s16(
vshrn_n_s32(vmull_s16( vget_low_s16(a.val), vget_low_s16(b.val)), 16),
vshrn_n_s32(vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val)), 16)
));
}
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
{
return v_uint16x8(vcombine_u16(
vshrn_n_u32(vmull_u16( vget_low_u16(a.val), vget_low_u16(b.val)), 16),
vshrn_n_u32(vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val)), 16)
));
}
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
@@ -506,6 +578,12 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
return v_int32x4(vaddq_s32(cd.val[0], cd.val[1]));
}
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
v_int32x4 s = v_dotprod(a, b);
return v_int32x4(vaddq_s32(s.val , c.val));
}
#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
@@ -686,6 +764,13 @@ OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float64x2, vreinterpretq_f64_u64, f64, u64)
#endif
inline v_float32x4 v_not_nan(const v_float32x4& a)
{ return v_float32x4(vreinterpretq_f32_u32(vceqq_f32(a.val, a.val))); }
#if CV_SIMD128_64F
inline v_float64x2 v_not_nan(const v_float64x2& a)
{ return v_float64x2(vreinterpretq_f64_u64(vceqq_f64(a.val, a.val))); }
#endif
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16)
@@ -694,8 +779,11 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_mul_wrap, vmulq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_mul_wrap, vmulq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_mul_wrap, vmulq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_mul_wrap, vmulq_s16)
// TODO: absdiff for signed integers
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
@@ -704,6 +792,12 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_absdiff, vabdq_f64)
#endif
/** Saturating absolute difference **/
inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
{ return v_int8x16(vqabsq_s8(vqsubq_s8(a.val, b.val))); }
inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
{ return v_int16x8(vqabsq_s16(vqsubq_s16(a.val, b.val))); }
#define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
{ \
@@ -725,9 +819,30 @@ inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
}
inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
#if CV_SIMD128_64F
// ARMv8, which adds support for 64-bit floating-point (so CV_SIMD128_64F is defined),
// also adds FMA support both for single- and double-precision floating-point vectors
return v_float32x4(vfmaq_f32(c.val, a.val, b.val));
#else
return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
#endif
}
inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
return v_int32x4(vmlaq_s32(c.val, a.val, b.val));
}
inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
return v_fma(a, b, c);
}
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
return v_fma(a, b, c);
}
#if CV_SIMD128_64F
@@ -742,9 +857,14 @@ inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
return v_float64x2(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
}
inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
return v_float64x2(vfmaq_f64(c.val, a.val, b.val));
}
inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
return v_float64x2(vaddq_f64(c.val, vmulq_f64(a.val, b.val)));
return v_fma(a, b, c);
}
#endif
@@ -759,15 +879,7 @@ template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); }
{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); }
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8)
@@ -778,6 +890,33 @@ OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint64x2, u64, int64, s64)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64)
#define OPENCV_HAL_IMPL_NEON_ROTATE_OP(_Tpvec, suffix) \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
{ return a; } \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); } \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
{ CV_UNUSED(b); return a; }
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint8x16, u8)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int8x16, s8)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint16x8, u16)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int16x8, s16)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint32x4, u32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int32x4, s32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float32x4, f32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint64x2, u64)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int64x2, s64)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float64x2, f64)
#endif
#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
@@ -791,6 +930,10 @@ inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
@@ -809,14 +952,6 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
#endif
#if CV_FP16
// Workaround for old compilers
inline v_float16x4 v_load_f16(const short* ptr)
{ return v_float16x4(cv_vld1_f16(ptr)); }
inline void v_store_f16(short* ptr, v_float16x4& a)
{ cv_vst1_f16(ptr, a.val); }
#endif
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
@@ -849,6 +984,13 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)
#if CV_SIMD128_64F
inline double v_reduce_sum(const v_float64x2& a)
{
return vgetq_lane_f64(a.val, 0) + vgetq_lane_f64(a.val, 1);
}
#endif
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
const v_float32x4& c, const v_float32x4& d)
{
@@ -864,6 +1006,49 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
return v_float32x4(vaddq_f32(v0, v1));
}
inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
{
uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vabdq_u8(a.val, b.val)));
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
return vget_lane_u32(vpadd_u32(t1, t1), 0);
}
inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
{
uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s8(vabdq_s8(a.val, b.val))));
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
return vget_lane_u32(vpadd_u32(t1, t1), 0);
}
inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
{
uint32x4_t t0 = vpaddlq_u16(vabdq_u16(a.val, b.val));
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
return vget_lane_u32(vpadd_u32(t1, t1), 0);
}
inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
{
uint32x4_t t0 = vpaddlq_u16(vreinterpretq_u16_s16(vabdq_s16(a.val, b.val)));
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
return vget_lane_u32(vpadd_u32(t1, t1), 0);
}
inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
{
uint32x4_t t0 = vabdq_u32(a.val, b.val);
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
return vget_lane_u32(vpadd_u32(t1, t1), 0);
}
inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
{
uint32x4_t t0 = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val));
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
return vget_lane_u32(vpadd_u32(t1, t1), 0);
}
inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
{
float32x4_t t0 = vabdq_f32(a.val, b.val);
float32x2_t t1 = vpadd_f32(vget_low_f32(t0), vget_high_f32(t0));
return vget_lane_f32(vpadd_f32(t1, t1), 0);
}
#define OPENCV_HAL_IMPL_NEON_POPCOUNT(_Tpvec, cast) \
inline v_uint32x4 v_popcount(const _Tpvec& a) \
{ \
@@ -995,6 +1180,14 @@ inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \
} \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
{ \
return _Tpwvec(vmovl_##suffix(vget_low_##suffix(a.val))); \
} \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
{ \
return _Tpwvec(vmovl_##suffix(vget_high_##suffix(a.val))); \
} \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ \
return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
@@ -1009,14 +1202,16 @@ OPENCV_HAL_IMPL_NEON_EXPAND(v_int32x4, v_int64x2, int, s32)
inline v_uint32x4 v_load_expand_q(const uchar* ptr)
{
uint8x8_t v0 = vcreate_u8(*(unsigned*)ptr);
typedef unsigned int CV_DECL_ALIGNED(1) unaligned_uint;
uint8x8_t v0 = vcreate_u8(*(unaligned_uint*)ptr);
uint16x4_t v1 = vget_low_u16(vmovl_u8(v0));
return v_uint32x4(vmovl_u16(v1));
}
inline v_int32x4 v_load_expand_q(const schar* ptr)
{
int8x8_t v0 = vcreate_s8(*(unsigned*)ptr);
typedef unsigned int CV_DECL_ALIGNED(1) unaligned_uint;
int8x8_t v0 = vcreate_s8(*(unaligned_uint*)ptr);
int16x4_t v1 = vget_low_s16(vmovl_s8(v0));
return v_int32x4(vmovl_s16(v1));
}
@@ -1095,6 +1290,18 @@ OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64)
#endif
#if CV_SIMD128_64F
inline v_int32x4 v_round(const v_float32x4& a)
{
float32x4_t a_ = a.val;
int32x4_t result;
__asm__ ("fcvtns %0.4s, %1.4s"
: "=w"(result)
: "w"(a_)
: /* No clobbers */);
return v_int32x4(result);
}
#else
inline v_int32x4 v_round(const v_float32x4& a)
{
static const int32x4_t v_sign = vdupq_n_s32(1 << 31),
@@ -1103,7 +1310,7 @@ inline v_int32x4 v_round(const v_float32x4& a)
int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val)));
return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));
}
#endif
inline v_int32x4 v_floor(const v_float32x4& a)
{
int32x4_t a1 = vcvtq_s32_f32(a.val);
@@ -1128,6 +1335,11 @@ inline v_int32x4 v_round(const v_float64x2& a)
return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero));
}
inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
{
return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), vmovn_s64(vcvtaq_s64_f64(b.val))));
}
inline v_int32x4 v_floor(const v_float64x2& a)
{
static const int32x2_t zero = vdup_n_s32(0);
@@ -1202,14 +1414,16 @@ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
c.val = v.val[2]; \
d.val = v.val[3]; \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b) \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
_Tpvec##x2_t v; \
v.val[0] = a.val; \
v.val[1] = b.val; \
vst2q_##suffix(ptr, v); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
_Tpvec##x3_t v; \
v.val[0] = a.val; \
@@ -1218,7 +1432,8 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec&
vst3q_##suffix(ptr, v); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
const v_##_Tpvec& c, const v_##_Tpvec& d) \
const v_##_Tpvec& c, const v_##_Tpvec& d, \
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
{ \
_Tpvec##x4_t v; \
v.val[0] = a.val; \
@@ -1228,6 +1443,83 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec&
vst4q_##suffix(ptr, v); \
}
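// --- Illustrative sketch (editor's addition, not part of the upstream diff) ---
// The new hal::StoreMode parameter keeps the universal-intrinsics signature
// uniform across backends: NEON's vst2q/vst3q/vst4q have no alignment
// requirement, so the mode is accepted and ignored here. Typical use, e.g.
// writing planar B, G, R rows back as interleaved BGR (hypothetical buffers,
// assuming the OpenCV universal-intrinsics API):
//   v_uint8x16 b = v_load(bp), g = v_load(gp), r = v_load(rp);
//   v_store_interleave(dst, b, g, r);    // defaults to hal::STORE_UNALIGNED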
#define OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(tp, suffix) \
inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b ) \
{ \
tp##x1_t a0 = vld1_##suffix(ptr); \
tp##x1_t b0 = vld1_##suffix(ptr + 1); \
tp##x1_t a1 = vld1_##suffix(ptr + 2); \
tp##x1_t b1 = vld1_##suffix(ptr + 3); \
a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
} \
\
inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, \
v_##tp##x2& b, v_##tp##x2& c ) \
{ \
tp##x1_t a0 = vld1_##suffix(ptr); \
tp##x1_t b0 = vld1_##suffix(ptr + 1); \
tp##x1_t c0 = vld1_##suffix(ptr + 2); \
tp##x1_t a1 = vld1_##suffix(ptr + 3); \
tp##x1_t b1 = vld1_##suffix(ptr + 4); \
tp##x1_t c1 = vld1_##suffix(ptr + 5); \
a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
} \
\
inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b, \
v_##tp##x2& c, v_##tp##x2& d ) \
{ \
tp##x1_t a0 = vld1_##suffix(ptr); \
tp##x1_t b0 = vld1_##suffix(ptr + 1); \
tp##x1_t c0 = vld1_##suffix(ptr + 2); \
tp##x1_t d0 = vld1_##suffix(ptr + 3); \
tp##x1_t a1 = vld1_##suffix(ptr + 4); \
tp##x1_t b1 = vld1_##suffix(ptr + 5); \
tp##x1_t c1 = vld1_##suffix(ptr + 6); \
tp##x1_t d1 = vld1_##suffix(ptr + 7); \
a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
d = v_##tp##x2(vcombine_##suffix(d0, d1)); \
} \
\
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
vst1_##suffix(ptr + 2, vget_high_##suffix(a.val)); \
vst1_##suffix(ptr + 3, vget_high_##suffix(b.val)); \
} \
\
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \
const v_##tp##x2& b, const v_##tp##x2& c, \
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
vst1_##suffix(ptr + 3, vget_high_##suffix(a.val)); \
vst1_##suffix(ptr + 4, vget_high_##suffix(b.val)); \
vst1_##suffix(ptr + 5, vget_high_##suffix(c.val)); \
} \
\
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
const v_##tp##x2& c, const v_##tp##x2& d, \
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
vst1_##suffix(ptr + 3, vget_low_##suffix(d.val)); \
vst1_##suffix(ptr + 4, vget_high_##suffix(a.val)); \
vst1_##suffix(ptr + 5, vget_high_##suffix(b.val)); \
vst1_##suffix(ptr + 6, vget_high_##suffix(c.val)); \
vst1_##suffix(ptr + 7, vget_high_##suffix(d.val)); \
}
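// --- Illustrative sketch (editor's addition, not part of the upstream diff) ---
// NEON has no vld2q/vst2q for 64-bit lanes, so the INT64 (de)interleave above
// is synthesised from one-lane vld1_/vst1_ operations. For the two-channel
// case the memory layout is [a0 b0 a1 b1]: the loads pick out a0,a1 and b0,b1
// and vcombine_* rebuilds each vector, while the stores emit the low halves
// before the high halves to restore the same interleaved order.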
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)
@@ -1239,6 +1531,9 @@ OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, f64)
#endif
OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(int64, s64)
OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(uint64, u64)
inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{
return v_float32x4(vcvtq_f32_s32(a.val));
@@ -1251,6 +1546,11 @@ inline v_float32x4 v_cvt_f32(const v_float64x2& a)
return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), zero));
}
inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{
return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), vcvt_f32_f64(b.val)));
}
inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_low_s32(a.val))));
@@ -1272,18 +1572,358 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
}
#endif
#if CV_FP16
inline v_float32x4 v_cvt_f32(const v_float16x4& a)
////////////// Lookup table access ////////////////////
inline v_int8x16 v_lut(const schar* tab, const int* idx)
{
return v_float32x4(vcvt_f32_f16(a.val));
schar CV_DECL_ALIGNED(32) elems[16] =
{
tab[idx[ 0]],
tab[idx[ 1]],
tab[idx[ 2]],
tab[idx[ 3]],
tab[idx[ 4]],
tab[idx[ 5]],
tab[idx[ 6]],
tab[idx[ 7]],
tab[idx[ 8]],
tab[idx[ 9]],
tab[idx[10]],
tab[idx[11]],
tab[idx[12]],
tab[idx[13]],
tab[idx[14]],
tab[idx[15]]
};
return v_int8x16(vld1q_s8(elems));
}
inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
{
schar CV_DECL_ALIGNED(32) elems[16] =
{
tab[idx[0]],
tab[idx[0] + 1],
tab[idx[1]],
tab[idx[1] + 1],
tab[idx[2]],
tab[idx[2] + 1],
tab[idx[3]],
tab[idx[3] + 1],
tab[idx[4]],
tab[idx[4] + 1],
tab[idx[5]],
tab[idx[5] + 1],
tab[idx[6]],
tab[idx[6] + 1],
tab[idx[7]],
tab[idx[7] + 1]
};
return v_int8x16(vld1q_s8(elems));
}
inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
{
schar CV_DECL_ALIGNED(32) elems[16] =
{
tab[idx[0]],
tab[idx[0] + 1],
tab[idx[0] + 2],
tab[idx[0] + 3],
tab[idx[1]],
tab[idx[1] + 1],
tab[idx[1] + 2],
tab[idx[1] + 3],
tab[idx[2]],
tab[idx[2] + 1],
tab[idx[2] + 2],
tab[idx[2] + 3],
tab[idx[3]],
tab[idx[3] + 1],
tab[idx[3] + 2],
tab[idx[3] + 3]
};
return v_int8x16(vld1q_s8(elems));
}
inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
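// --- Illustrative sketch (editor's addition, not part of the upstream diff) ---
// NEON has no general gather instruction, so the v_lut* family materialises
// the gathered lanes in an aligned scratch array and issues a single vld1q.
// v_lut_pairs and v_lut_quads read 2 or 4 consecutive elements per index,
// halving or quartering the number of scalar table reads. Reference semantics
// for the 8-bit case in plain C++17 (hypothetical name):
static inline void v_lut_ref(const signed char* tab, const int* idx,
                             signed char out[16])
{
    for (int i = 0; i < 16; ++i)
        out[i] = tab[idx[i]];  // one table element per lane, as v_lut does
}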
inline v_int16x8 v_lut(const short* tab, const int* idx)
{
short CV_DECL_ALIGNED(32) elems[8] =
{
tab[idx[0]],
tab[idx[1]],
tab[idx[2]],
tab[idx[3]],
tab[idx[4]],
tab[idx[5]],
tab[idx[6]],
tab[idx[7]]
};
return v_int16x8(vld1q_s16(elems));
}
inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
{
short CV_DECL_ALIGNED(32) elems[8] =
{
tab[idx[0]],
tab[idx[0] + 1],
tab[idx[1]],
tab[idx[1] + 1],
tab[idx[2]],
tab[idx[2] + 1],
tab[idx[3]],
tab[idx[3] + 1]
};
return v_int16x8(vld1q_s16(elems));
}
inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
{
return v_int16x8(vcombine_s16(vld1_s16(tab + idx[0]), vld1_s16(tab + idx[1])));
}
inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
inline v_int32x4 v_lut(const int* tab, const int* idx)
{
int CV_DECL_ALIGNED(32) elems[4] =
{
tab[idx[0]],
tab[idx[1]],
tab[idx[2]],
tab[idx[3]]
};
return v_int32x4(vld1q_s32(elems));
}
inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
{
return v_int32x4(vcombine_s32(vld1_s32(tab + idx[0]), vld1_s32(tab + idx[1])));
}
inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
{
return v_int32x4(vld1q_s32(tab + idx[0]));
}
inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
{
return v_int64x2(vcombine_s64(vcreate_s64(tab[idx[0]]), vcreate_s64(tab[idx[1]])));
}
inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
{
return v_int64x2(vld1q_s64(tab + idx[0]));
}
inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
inline v_float32x4 v_lut(const float* tab, const int* idx)
{
float CV_DECL_ALIGNED(32) elems[4] =
{
tab[idx[0]],
tab[idx[1]],
tab[idx[2]],
tab[idx[3]]
};
return v_float32x4(vld1q_f32(elems));
}
inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
{
uint64 CV_DECL_ALIGNED(32) elems[2] =
{
*(uint64*)(tab + idx[0]),
*(uint64*)(tab + idx[1])
};
return v_float32x4(vreinterpretq_f32_u64(vld1q_u64(elems)));
}
inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
{
return v_float32x4(vld1q_f32(tab + idx[0]));
}
inline v_float16x4 v_cvt_f16(const v_float32x4& a)
inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
return v_float16x4(vcvt_f16_f32(a.val));
int CV_DECL_ALIGNED(32) elems[4] =
{
tab[vgetq_lane_s32(idxvec.val, 0)],
tab[vgetq_lane_s32(idxvec.val, 1)],
tab[vgetq_lane_s32(idxvec.val, 2)],
tab[vgetq_lane_s32(idxvec.val, 3)]
};
return v_int32x4(vld1q_s32(elems));
}
inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
{
unsigned CV_DECL_ALIGNED(32) elems[4] =
{
tab[vgetq_lane_s32(idxvec.val, 0)],
tab[vgetq_lane_s32(idxvec.val, 1)],
tab[vgetq_lane_s32(idxvec.val, 2)],
tab[vgetq_lane_s32(idxvec.val, 3)]
};
return v_uint32x4(vld1q_u32(elems));
}
inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
float CV_DECL_ALIGNED(32) elems[4] =
{
tab[vgetq_lane_s32(idxvec.val, 0)],
tab[vgetq_lane_s32(idxvec.val, 1)],
tab[vgetq_lane_s32(idxvec.val, 2)],
tab[vgetq_lane_s32(idxvec.val, 3)]
};
return v_float32x4(vld1q_f32(elems));
}
inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
{
/*int CV_DECL_ALIGNED(32) idx[4];
v_store(idx, idxvec);
float32x4_t xy02 = vcombine_f32(vld1_f32(tab + idx[0]), vld1_f32(tab + idx[2]));
float32x4_t xy13 = vcombine_f32(vld1_f32(tab + idx[1]), vld1_f32(tab + idx[3]));
float32x4x2_t xxyy = vuzpq_f32(xy02, xy13);
x = v_float32x4(xxyy.val[0]);
y = v_float32x4(xxyy.val[1]);*/
int CV_DECL_ALIGNED(32) idx[4];
v_store_aligned(idx, idxvec);
x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]);
}
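// --- Illustrative sketch (editor's addition, not part of the upstream diff) ---
// v_lut_deinterleave assumes tab stores (x, y) pairs contiguously: lane i of
// x comes from tab[idx[i]] and lane i of y from tab[idx[i] + 1]. The
// commented-out variant above computed the same result with two pair-loads
// plus a vuzpq_f32 unzip; the scalar-constructor form was kept instead (the
// diff does not record why).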
inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
{
return v_int8x16(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0705060403010200)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0705060403010200))));
}
inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
{
return v_int8x16(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0703060205010400)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0703060205010400))));
}
inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
{
return v_int16x8(vreinterpretq_s16_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0706030205040100)), vtbl1_s8(vget_high_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0706030205040100)))));
}
inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
{
int16x4x2_t res = vzip_s16(vget_low_s16(vec.val), vget_high_s16(vec.val));
return v_int16x8(vcombine_s16(res.val[0], res.val[1]));
}
inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
{
int32x2x2_t res = vzip_s32(vget_low_s32(vec.val), vget_high_s32(vec.val));
return v_int32x4(vcombine_s32(res.val[0], res.val[1]));
}
inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
{
return v_int8x16(vextq_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0605040201000000)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0807060504020100))), vdupq_n_s8(0), 2));
}
inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
{
return v_int16x8(vreinterpretq_s16_s8(vextq_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0504030201000000)), vget_high_s8(vreinterpretq_s8_s16(vec.val))), vdupq_n_s8(0), 2)));
}
inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
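// --- Illustrative sketch (editor's addition, not part of the upstream diff) ---
// v_pack_triplets drops every fourth lane, turning (x0 y0 z0 _ x1 y1 z1 _ ...)
// into (x0 y0 z0 x1 y1 z1 ...); the 64-bit vcreate_s8 constants above are
// little-endian byte-index tables for vtbl1. Reference semantics for the
// 8-bit case in plain C++17 (hypothetical name; the last four output lanes
// are filler and not meaningful):
static inline void v_pack_triplets_ref(const signed char in[16], signed char out[16])
{
    int o = 0;
    for (int g = 0; g < 4; ++g)       // four groups of (x, y, z, pad)
        for (int k = 0; k < 3; ++k)
            out[o++] = in[4 * g + k];
    while (o < 16) out[o++] = 0;      // filler, contents unspecified
}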
#if CV_SIMD128_64F
inline v_float64x2 v_lut(const double* tab, const int* idx)
{
double CV_DECL_ALIGNED(32) elems[2] =
{
tab[idx[0]],
tab[idx[1]]
};
return v_float64x2(vld1q_f64(elems));
}
inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
{
return v_float64x2(vld1q_f64(tab + idx[0]));
}
inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
double CV_DECL_ALIGNED(32) elems[2] =
{
tab[vgetq_lane_s32(idxvec.val, 0)],
tab[vgetq_lane_s32(idxvec.val, 1)],
};
return v_float64x2(vld1q_f64(elems));
}
inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
{
int CV_DECL_ALIGNED(32) idx[4];
v_store_aligned(idx, idxvec);
x = v_float64x2(tab[idx[0]], tab[idx[1]]);
y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
}
#endif
////// FP16 support ///////
#if CV_FP16
inline v_float32x4 v_load_expand(const float16_t* ptr)
{
float16x4_t v =
#ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro
(float16x4_t)vld1_s16((const short*)ptr);
#else
vld1_f16((const __fp16*)ptr);
#endif
return v_float32x4(vcvt_f32_f16(v));
}
inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
{
float16x4_t hv = vcvt_f16_f32(v.val);
#ifndef vst1_f16 // APPLE compiler defines vst1_f16 as macro
vst1_s16((short*)ptr, (int16x4_t)hv);
#else
vst1_f16((__fp16*)ptr, hv);
#endif
}
#else
inline v_float32x4 v_load_expand(const float16_t* ptr)
{
const int N = 4;
float buf[N];
for( int i = 0; i < N; i++ ) buf[i] = (float)ptr[i];
return v_load(buf);
}
inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
{
const int N = 4;
float buf[N];
v_store(buf, v);
for( int i = 0; i < N; i++ ) ptr[i] = float16_t(buf[i]);
}
#endif
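// --- Illustrative sketch (editor's addition, not part of the upstream diff) ---
// FP16 strategy: with hardware half-float support (CV_FP16) the load path is
// a single vcvt_f32_f16, and the vld1_f16/vst1_f16 #ifdef dance only works
// around Apple toolchains that define those intrinsics as macros with a
// different pointer type. Without CV_FP16 the fallback round-trips through
// float16_t's software converter one element at a time, which is correct but
// much slower.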
inline void v_cleanup() {}
//! @name Check SIMD support
//! @{
//! @brief Check CPU capability of SIMD operation

View File

@@ -0,0 +1,167 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#ifndef OPENCV_HAL_INTRIN_SSE_EM_HPP
#define OPENCV_HAL_INTRIN_SSE_EM_HPP
namespace cv
{
//! @cond IGNORED
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#define OPENCV_HAL_SSE_WRAP_1(fun, tp) \
inline tp _v128_##fun(const tp& a) \
{ return _mm_##fun(a); }
#define OPENCV_HAL_SSE_WRAP_2(fun, tp) \
inline tp _v128_##fun(const tp& a, const tp& b) \
{ return _mm_##fun(a, b); }
#define OPENCV_HAL_SSE_WRAP_3(fun, tp) \
inline tp _v128_##fun(const tp& a, const tp& b, const tp& c) \
{ return _mm_##fun(a, b, c); }
///////////////////////////// XOP /////////////////////////////
// [todo] define CV_XOP
#if 1 // CV_XOP
inline __m128i _v128_comgt_epu32(const __m128i& a, const __m128i& b)
{
const __m128i delta = _mm_set1_epi32((int)0x80000000);
return _mm_cmpgt_epi32(_mm_xor_si128(a, delta), _mm_xor_si128(b, delta));
}
// wrapping XOP
#else
OPENCV_HAL_SSE_WRAP_2(_v128_comgt_epu32, __m128i)
#endif // !CV_XOP
///////////////////////////// SSE4.1 /////////////////////////////
#if !CV_SSE4_1
/** Swizzle **/
inline __m128i _v128_blendv_epi8(const __m128i& a, const __m128i& b, const __m128i& mask)
{ return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(b, a), mask)); }
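// --- Illustrative sketch (editor's addition, not part of the upstream diff) ---
// The pre-SSE4.1 blendv emulation above is the classic branchless bit-select:
//   a ^ ((b ^ a) & mask)  ==  (a & ~mask) | (b & mask)
// i.e. take bits of b where mask is set and bits of a elsewhere. Note that it
// selects per bit, whereas the real _mm_blendv_epi8 tests only each byte's
// sign bit, so callers must pass canonical all-ones/all-zeros byte masks.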
/** Convert **/
// 8 >> 16
inline __m128i _v128_cvtepu8_epi16(const __m128i& a)
{
const __m128i z = _mm_setzero_si128();
return _mm_unpacklo_epi8(a, z);
}
inline __m128i _v128_cvtepi8_epi16(const __m128i& a)
{ return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); }
// 8 >> 32
inline __m128i _v128_cvtepu8_epi32(const __m128i& a)
{
const __m128i z = _mm_setzero_si128();
return _mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z);
}
inline __m128i _v128_cvtepi8_epi32(const __m128i& a)
{
__m128i r = _mm_unpacklo_epi8(a, a);
r = _mm_unpacklo_epi8(r, r);
return _mm_srai_epi32(r, 24);
}
// 16 >> 32
inline __m128i _v128_cvtepu16_epi32(const __m128i& a)
{
const __m128i z = _mm_setzero_si128();
return _mm_unpacklo_epi16(a, z);
}
inline __m128i _v128_cvtepi16_epi32(const __m128i& a)
{ return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16); }
// 32 >> 64
inline __m128i _v128_cvtepu32_epi64(const __m128i& a)
{
const __m128i z = _mm_setzero_si128();
return _mm_unpacklo_epi32(a, z);
}
inline __m128i _v128_cvtepi32_epi64(const __m128i& a)
{ return _mm_unpacklo_epi32(a, _mm_srai_epi32(a, 31)); }
/** Arithmetic **/
inline __m128i _v128_mullo_epi32(const __m128i& a, const __m128i& b)
{
__m128i c0 = _mm_mul_epu32(a, b);
__m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a, 32), _mm_srli_epi64(b, 32));
__m128i d0 = _mm_unpacklo_epi32(c0, c1);
__m128i d1 = _mm_unpackhi_epi32(c0, c1);
return _mm_unpacklo_epi64(d0, d1);
}
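// --- Illustrative sketch (editor's addition, not part of the upstream diff) ---
// SSE2 has no 32-bit lane-wise multiply, so _v128_mullo_epi32 builds one from
// _mm_mul_epu32, which multiplies only the even lanes into 64-bit products:
// c0 holds the products of lanes 0,2 and c1 (after the >> 32) of lanes 1,3.
// The low 32 bits of a product are the same for signed and unsigned inputs,
// so the unsigned multiply is safe. The unpack pair then gathers the low 32
// bits of each product back into lane order:
//   d0 = [p0.lo p1.lo p0.hi p1.hi], d1 = [p2.lo p3.lo p2.hi p3.hi]
//   _mm_unpacklo_epi64(d0, d1) = [p0.lo p1.lo p2.lo p3.lo]   // mullo result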
/** Math **/
inline __m128i _v128_min_epu32(const __m128i& a, const __m128i& b)
{ return _v128_blendv_epi8(a, b, _v128_comgt_epu32(a, b)); }
// wrapping SSE4.1
#else
OPENCV_HAL_SSE_WRAP_1(cvtepu8_epi16, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepi8_epi16, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepu8_epi32, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepi8_epi32, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepu16_epi32, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepi16_epi32, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepu32_epi64, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepi32_epi64, __m128i)
OPENCV_HAL_SSE_WRAP_2(min_epu32, __m128i)
OPENCV_HAL_SSE_WRAP_2(mullo_epi32, __m128i)
OPENCV_HAL_SSE_WRAP_3(blendv_epi8, __m128i)
#endif // !CV_SSE4_1
///////////////////////////// Revolutionary /////////////////////////////
/** Convert **/
// 16 << 8
inline __m128i _v128_cvtepu8_epi16_high(const __m128i& a)
{
const __m128i z = _mm_setzero_si128();
return _mm_unpackhi_epi8(a, z);
}
inline __m128i _v128_cvtepi8_epi16_high(const __m128i& a)
{ return _mm_srai_epi16(_mm_unpackhi_epi8(a, a), 8); }
// 32 << 16
inline __m128i _v128_cvtepu16_epi32_high(const __m128i& a)
{
const __m128i z = _mm_setzero_si128();
return _mm_unpackhi_epi16(a, z);
}
inline __m128i _v128_cvtepi16_epi32_high(const __m128i& a)
{ return _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16); }
// 64 << 32
inline __m128i _v128_cvtepu32_epi64_high(const __m128i& a)
{
const __m128i z = _mm_setzero_si128();
return _mm_unpackhi_epi32(a, z);
}
inline __m128i _v128_cvtepi32_epi64_high(const __m128i& a)
{ return _mm_unpackhi_epi32(a, _mm_srai_epi32(a, 31)); }
/** Miscellaneous **/
inline __m128i _v128_packs_epu32(const __m128i& a, const __m128i& b)
{
const __m128i m = _mm_set1_epi32(65535);
__m128i am = _v128_min_epu32(a, m);
__m128i bm = _v128_min_epu32(b, m);
#if CV_SSE4_1
return _mm_packus_epi32(am, bm);
#else
const __m128i d = _mm_set1_epi32(32768), nd = _mm_set1_epi16(-32768);
am = _mm_sub_epi32(am, d);
bm = _mm_sub_epi32(bm, d);
am = _mm_packs_epi32(am, bm);
return _mm_sub_epi16(am, nd);
#endif
}
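// --- Illustrative sketch (editor's addition, not part of the upstream diff) ---
// Unsigned 32 -> 16 saturating pack without SSE4.1: both inputs are first
// clamped to 65535 so each value fits a u16, then biased by -32768 so that
// the signed pack _mm_packs_epi32 can never saturate, and finally un-biased
// with _mm_sub_epi16(x, -32768). Per-lane scalar equivalent:
//   out = (ushort)std::min(v, 65535u);   // the bias trick cancels out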
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond
} // cv::
#endif // OPENCV_HAL_INTRIN_SSE_EM_HPP

View File

@@ -1,46 +1,6 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#ifndef OPENCV_HAL_VSX_HPP
#define OPENCV_HAL_VSX_HPP
@@ -51,11 +11,6 @@
#define CV_SIMD128 1
#define CV_SIMD128_64F 1
/**
* todo: supporting half precision for power9
* convert instructions xvcvhpsp, xvcvsphp
**/
namespace cv
{
@@ -276,34 +231,50 @@ OPENCV_HAL_IMPL_VSX_INITVEC(v_int64x2, int64, s64, vec_dword2)
OPENCV_HAL_IMPL_VSX_INITVEC(v_float32x4, float, f32, vec_float4)
OPENCV_HAL_IMPL_VSX_INITVEC(v_float64x2, double, f64, vec_double2)
#define OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(_Tpvec, _Tp, ld_func, st_func) \
#define OPENCV_HAL_IMPL_VSX_LOADSTORE_C(_Tpvec, _Tp, ld, ld_a, st, st_a) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(ld_func(0, ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(ld_func(0, ptr)); } \
{ return _Tpvec(ld(0, ptr)); } \
inline _Tpvec v_load_aligned(VSX_UNUSED(const _Tp* ptr)) \
{ return _Tpvec(ld_a(0, ptr)); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(vec_ld_l8(ptr)); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ return _Tpvec(vec_mergesqh(vec_ld_l8(ptr0), vec_ld_l8(ptr1))); } \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ st_func(a.val, 0, ptr); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ st_func(a.val, 0, ptr); } \
{ st(a.val, 0, ptr); } \
inline void v_store_aligned(VSX_UNUSED(_Tp* ptr), const _Tpvec& a) \
{ st_a(a.val, 0, ptr); } \
inline void v_store_aligned_nocache(VSX_UNUSED(_Tp* ptr), const _Tpvec& a) \
{ st_a(a.val, 0, ptr); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
{ if(mode == hal::STORE_UNALIGNED) st(a.val, 0, ptr); else st_a(a.val, 0, ptr); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vec_st_l8(a.val, ptr); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ vec_st_h8(a.val, ptr); }
OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_uint8x16, uchar, vsx_ld, vsx_st)
OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_int8x16, schar, vsx_ld, vsx_st)
OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_uint16x8, ushort, vsx_ld, vsx_st)
OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_int16x8, short, vsx_ld, vsx_st)
OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_uint32x4, uint, vsx_ld, vsx_st)
OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_int32x4, int, vsx_ld, vsx_st)
OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_float32x4, float, vsx_ld, vsx_st)
OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_float64x2, double, vsx_ld, vsx_st)
OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_uint64x2, uint64, vsx_ld2, vsx_st2)
OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_int64x2, int64, vsx_ld2, vsx_st2)
// working around gcc bug for aligned ld/st
// if the runtime check for vec_ld/st fails, we fall back to unaligned ld/st
// https://github.com/opencv/opencv/issues/13211
#ifdef CV_COMPILER_VSX_BROKEN_ALIGNED
#define OPENCV_HAL_IMPL_VSX_LOADSTORE(_Tpvec, _Tp) \
OPENCV_HAL_IMPL_VSX_LOADSTORE_C(_Tpvec, _Tp, vsx_ld, vsx_ld, vsx_st, vsx_st)
#else
#define OPENCV_HAL_IMPL_VSX_LOADSTORE(_Tpvec, _Tp) \
OPENCV_HAL_IMPL_VSX_LOADSTORE_C(_Tpvec, _Tp, vsx_ld, vec_ld, vsx_st, vec_st)
#endif
OPENCV_HAL_IMPL_VSX_LOADSTORE(v_uint8x16, uchar)
OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int8x16, schar)
OPENCV_HAL_IMPL_VSX_LOADSTORE(v_uint16x8, ushort)
OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int16x8, short)
OPENCV_HAL_IMPL_VSX_LOADSTORE(v_uint32x4, uint)
OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int32x4, int)
OPENCV_HAL_IMPL_VSX_LOADSTORE(v_float32x4, float)
OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_float64x2, double, vsx_ld, vsx_ld, vsx_st, vsx_st)
OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_uint64x2, uint64, vsx_ld2, vsx_ld2, vsx_st2, vsx_st2)
OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_int64x2, int64, vsx_ld2, vsx_ld2, vsx_st2, vsx_st2)
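// --- Illustrative sketch (editor's addition, not part of the upstream diff) ---
// Design note: OPENCV_HAL_IMPL_VSX_LOADSTORE_C now takes separate unaligned
// (ld/st) and aligned (ld_a/st_a) primitives. On compilers hit by the gcc
// alignment bug referenced above, the "aligned" slots are simply filled with
// the unaligned vsx_ld/vsx_st, trading some speed for correctness; the 64-bit
// integer and double vectors always use the unaligned forms.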
//////////////// Value reordering ///////////////
@@ -317,13 +288,16 @@ inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, \
inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b, \
_Tpvec& c, _Tpvec& d) \
{ vec_ld_deinterleave(ptr, a.val, b.val, c.val, d.val); } \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b) \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, \
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ vec_st_interleave(a.val, b.val, ptr); } \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, \
const _Tpvec& b, const _Tpvec& c) \
const _Tpvec& b, const _Tpvec& c, \
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ vec_st_interleave(a.val, b.val, c.val, ptr); } \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, \
const _Tpvec& c, const _Tpvec& d) \
const _Tpvec& c, const _Tpvec& d, \
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ vec_st_interleave(a.val, b.val, c.val, d.val, ptr); }
OPENCV_HAL_IMPL_VSX_INTERLEAVE(uchar, v_uint8x16)
@@ -334,6 +308,8 @@ OPENCV_HAL_IMPL_VSX_INTERLEAVE(uint, v_uint32x4)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(int, v_int32x4)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(float, v_float32x4)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(double, v_float64x2)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(int64, v_int64x2)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(uint64, v_uint64x2)
/* Expand */
#define OPENCV_HAL_IMPL_VSX_EXPAND(_Tpvec, _Tpwvec, _Tp, fl, fh) \
@@ -342,8 +318,12 @@ inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
b0.val = fh(a.val); \
b1.val = fl(a.val); \
} \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
{ return _Tpwvec(fh(a.val)); } \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
{ return _Tpwvec(fl(a.val)); } \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ return _Tpwvec(fh(vsx_ld(0, ptr))); }
{ return _Tpwvec(fh(vec_ld_l8(ptr))); }
OPENCV_HAL_IMPL_VSX_EXPAND(v_uint8x16, v_uint16x8, uchar, vec_unpacklu, vec_unpackhu)
OPENCV_HAL_IMPL_VSX_EXPAND(v_int8x16, v_int16x8, schar, vec_unpackl, vec_unpackh)
@@ -353,10 +333,10 @@ OPENCV_HAL_IMPL_VSX_EXPAND(v_uint32x4, v_uint64x2, uint, vec_unpacklu, vec_unpac
OPENCV_HAL_IMPL_VSX_EXPAND(v_int32x4, v_int64x2, int, vec_unpackl, vec_unpackh)
inline v_uint32x4 v_load_expand_q(const uchar* ptr)
{ return v_uint32x4(vec_ld_buw(ptr)); }
{ return v_uint32x4(vec_uint4_set(ptr[0], ptr[1], ptr[2], ptr[3])); }
inline v_int32x4 v_load_expand_q(const schar* ptr)
{ return v_int32x4(vec_ld_bsw(ptr)); }
{ return v_int32x4(vec_int4_set(ptr[0], ptr[1], ptr[2], ptr[3])); }
/* pack */
#define OPENCV_HAL_IMPL_VSX_PACK(_Tpvec, _Tp, _Tpwvec, _Tpvn, _Tpdel, sfnc, pkfnc, addfnc, pack) \
@@ -406,6 +386,35 @@ OPENCV_HAL_IMPL_VSX_PACK(v_uint16x8, ushort, v_int32x4, unsigned int, int,
//OPENCV_HAL_IMPL_VSX_PACK(v_uint32x4, uint, v_int64x2, unsigned long long, long long,
// vec_sra, vec_packsu, vec_add, pack_u)
// pack boolean
inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
{
vec_uchar16 ab = vec_pack(a.val, b.val);
return v_uint8x16(ab);
}
inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
const v_uint32x4& c, const v_uint32x4& d)
{
vec_ushort8 ab = vec_pack(a.val, b.val);
vec_ushort8 cd = vec_pack(c.val, d.val);
return v_uint8x16(vec_pack(ab, cd));
}
inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
const v_uint64x2& g, const v_uint64x2& h)
{
vec_uint4 ab = vec_pack(a.val, b.val);
vec_uint4 cd = vec_pack(c.val, d.val);
vec_uint4 ef = vec_pack(e.val, f.val);
vec_uint4 gh = vec_pack(g.val, h.val);
vec_ushort8 abcd = vec_pack(ab, cd);
vec_ushort8 efgh = vec_pack(ef, gh);
return v_uint8x16(vec_pack(abcd, efgh));
}
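// --- Illustrative sketch (editor's addition, not part of the upstream diff) ---
// v_pack_b narrows boolean (all-ones / all-zeros) mask vectors to one byte
// per element by chained vec_pack truncations: 16 -> 8 bits needs one pack,
// 32 -> 8 two, and 64 -> 8 three. Plain truncation is safe only because mask
// lanes are 0 or ~0, so the low byte already carries the lane's truth value.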
/* Recombine */
template <typename _Tpvec>
inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1)
@@ -429,36 +438,6 @@ inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d)
d.val = vec_mergesql(a.val, b.val);
}
/* Extract */
template<int s, typename _Tpvec>
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
{
const int w = sizeof(typename _Tpvec::lane_type);
const int n = _Tpvec::nlanes;
const unsigned int sf = ((w * n) - (s * w));
if (s == 0)
return _Tpvec(a.val);
else if (sf > 15)
return _Tpvec();
// bitwise it just to make xlc happy
return _Tpvec(vec_sld(b.val, a.val, sf & 15));
}
#define OPENCV_HAL_IMPL_VSX_EXTRACT_2(_Tpvec) \
template<int s> \
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
{ \
switch(s) { \
case 0: return _Tpvec(a.val); \
case 2: return _Tpvec(b.val); \
case 1: return _Tpvec(vec_sldw(b.val, a.val, 2)); \
default: return _Tpvec(); \
} \
}
OPENCV_HAL_IMPL_VSX_EXTRACT_2(v_uint64x2)
OPENCV_HAL_IMPL_VSX_EXTRACT_2(v_int64x2)
////////// Arithmetic, bitwise and comparison operations /////////
/* Element-wise binary and unary operations */
@@ -475,10 +454,8 @@ OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int8x16, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int8x16, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint16x8, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint16x8, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint16x8, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int16x8, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int16x8, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_int16x8, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint32x4, vec_mul)
@@ -498,22 +475,51 @@ OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint64x2, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int64x2, vec_sub)
inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, v_int32x4& c, v_int32x4& d)
// saturating multiply
#define OPENCV_HAL_IMPL_VSX_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpwvec c, d; \
v_mul_expand(a, b, c, d); \
return v_pack(c, d); \
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }
OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int8x16, v_int16x8)
OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint8x16, v_uint16x8)
OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int16x8, v_int32x4)
OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint16x8, v_uint32x4)
template<typename Tvec, typename Twvec>
inline void v_mul_expand(const Tvec& a, const Tvec& b, Twvec& c, Twvec& d)
{
c.val = vec_mul(vec_unpackh(a.val), vec_unpackh(b.val));
d.val = vec_mul(vec_unpackl(a.val), vec_unpackl(b.val));
}
inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, v_uint32x4& c, v_uint32x4& d)
{
c.val = vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val));
d.val = vec_mul(vec_unpacklu(a.val), vec_unpacklu(b.val));
Twvec p0 = Twvec(vec_mule(a.val, b.val));
Twvec p1 = Twvec(vec_mulo(a.val, b.val));
v_zip(p0, p1, c, d);
}
inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, v_uint64x2& c, v_uint64x2& d)
{
c.val = vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val));
d.val = vec_mul(vec_unpacklu(a.val), vec_unpacklu(b.val));
}
inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
{
vec_int4 p0 = vec_mule(a.val, b.val);
vec_int4 p1 = vec_mulo(a.val, b.val);
static const vec_uchar16 perm = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
return v_int16x8(vec_perm(vec_short8_c(p0), vec_short8_c(p1), perm));
}
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
{
vec_uint4 p0 = vec_mule(a.val, b.val);
vec_uint4 p1 = vec_mulo(a.val, b.val);
static const vec_uchar16 perm = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
return v_uint16x8(vec_perm(vec_ushort8_c(p0), vec_ushort8_c(p1), perm));
}
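// --- Illustrative sketch (editor's addition, not part of the upstream diff) ---
// vec_mule/vec_mulo return the even- and odd-lane products at double width;
// v_zip re-interleaves them so c,d hold the products in original lane order,
// and OPENCV_HAL_IMPL_VSX_MUL_SAT builds a saturating operator* on top as
// "widening multiply, then saturating pack". v_mul_hi instead keeps only the
// upper halfword of each 32-bit product, gathered back into lane order by the
// vec_perm byte mask. Per-lane scalar equivalents for int16:
//   mul_sat: clamp((int32)a * b, -32768, 32767)
//   mul_hi:  (short)(((int32)a * b) >> 16)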
/** Non-saturating arithmetics **/
#define OPENCV_HAL_IMPL_VSX_BIN_FUNC(func, intrin) \
template<typename _Tpvec> \
@@ -522,6 +528,7 @@ inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_add_wrap, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_sub_wrap, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_mul_wrap, vec_mul)
/** Bitwise shifts **/
#define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpvec, shr, splfunc) \
@@ -603,6 +610,11 @@ OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_float64x2)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint64x2)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int64x2)
inline v_float32x4 v_not_nan(const v_float32x4& a)
{ return v_float32x4(vec_cmpeq(a.val, a.val)); }
inline v_float64x2 v_not_nan(const v_float64x2& a)
{ return v_float64x2(vec_cmpeq(a.val, a.val)); }
/** min/max **/
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_min, vec_min)
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_max, vec_max)
@@ -628,9 +640,10 @@ OPENCV_IMPL_VSX_ROTATE_LR(v_uint16x8, vec_ushort8)
OPENCV_IMPL_VSX_ROTATE_LR(v_int16x8, vec_short8)
OPENCV_IMPL_VSX_ROTATE_LR(v_uint32x4, vec_uint4)
OPENCV_IMPL_VSX_ROTATE_LR(v_int32x4, vec_int4)
OPENCV_IMPL_VSX_ROTATE_LR(v_float32x4, vec_float4)
OPENCV_IMPL_VSX_ROTATE_LR(v_uint64x2, vec_udword2)
OPENCV_IMPL_VSX_ROTATE_LR(v_int64x2, vec_dword2)
OPENCV_IMPL_VSX_ROTATE_LR(v_float64x2, vec_double2)
template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)
@@ -654,7 +667,7 @@ inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)
return _Tpvec(vec_sld(a.val, b.val, CV_SHIFT));
}
#define OPENCV_IMPL_VSX_ROTATE_64(_Tpvec, suffix, rg1, rg2) \
#define OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, suffix, rg1, rg2) \
template<int imm> \
inline _Tpvec v_rotate_##suffix(const _Tpvec& a, const _Tpvec& b) \
{ \
@@ -663,11 +676,18 @@ inline _Tpvec v_rotate_##suffix(const _Tpvec& a, const _Tpvec& b) \
return imm ? b : a; \
}
OPENCV_IMPL_VSX_ROTATE_64(v_int64x2, right, a, b)
OPENCV_IMPL_VSX_ROTATE_64(v_uint64x2, right, a, b)
#define OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(_Tpvec) \
OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, left, b, a) \
OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, right, a, b)
OPENCV_IMPL_VSX_ROTATE_64(v_int64x2, left, b, a)
OPENCV_IMPL_VSX_ROTATE_64(v_uint64x2, left, b, a)
OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_float64x2)
OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_uint64x2)
OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_int64x2)
/* Extract */
template<int s, typename _Tpvec>
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
{ return v_rotate_right<s>(a, b); }
////////// Reduce and mask /////////
@@ -699,6 +719,11 @@ OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, sum, vec_add)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, max, vec_max)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, min, vec_min)
inline double v_reduce_sum(const v_float64x2& a)
{
return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0);
}
#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(_Tpvec, _Tpvec2, scalartype, suffix, func) \
inline scalartype v_reduce_##suffix(const _Tpvec& a) \
{ \
@@ -722,6 +747,50 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
return v_float32x4(vec_mergeh(ac, bd));
}
inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
{
const vec_uint4 zero4 = vec_uint4_z;
vec_uint4 sum4 = vec_sum4s(vec_absd(a.val, b.val), zero4);
return (unsigned)vec_extract(vec_sums(vec_int4_c(sum4), vec_int4_c(zero4)), 3);
}
inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
{
const vec_int4 zero4 = vec_int4_z;
vec_char16 ad = vec_abss(vec_subs(a.val, b.val));
vec_int4 sum4 = vec_sum4s(ad, zero4);
return (unsigned)vec_extract(vec_sums(sum4, zero4), 3);
}
inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
{
vec_ushort8 ad = vec_absd(a.val, b.val);
VSX_UNUSED(vec_int4) sum = vec_sums(vec_int4_c(vec_unpackhu(ad)), vec_int4_c(vec_unpacklu(ad)));
return (unsigned)vec_extract(sum, 3);
}
inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
{
const vec_int4 zero4 = vec_int4_z;
vec_short8 ad = vec_abss(vec_subs(a.val, b.val));
vec_int4 sum4 = vec_sum4s(ad, zero4);
return (unsigned)vec_extract(vec_sums(sum4, zero4), 3);
}
inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
{
const vec_uint4 ad = vec_absd(a.val, b.val);
const vec_uint4 rd = vec_add(ad, vec_sld(ad, ad, 8));
return vec_extract(vec_add(rd, vec_sld(rd, rd, 4)), 0);
}
inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
{
vec_int4 ad = vec_abss(vec_sub(a.val, b.val));
return (unsigned)vec_extract(vec_sums(ad, vec_int4_z), 3);
}
inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
{
const vec_float4 ad = vec_abs(vec_sub(a.val, b.val));
const vec_float4 rd = vec_add(ad, vec_sld(ad, ad, 8));
return vec_extract(vec_add(rd, vec_sld(rd, rd, 4)), 0);
}
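// --- Illustrative sketch (editor's addition, not part of the upstream diff) ---
// SAD reduction on VSX: vec_absd yields |a - b| directly for unsigned types,
// the signed variants use saturating subtract plus saturating abs, and the
// horizontal sum goes through vec_sum4s (four partial sums) and vec_sums
// (final fold, result in element 3). The float and u32 variants fold with
// two vec_sld rotations instead, since vec_sums is signed-integer only.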
/** Popcount **/
template<typename _Tpvec>
inline v_uint32x4 v_popcount(const _Tpvec& a)
@@ -776,26 +845,33 @@ inline int v_signmask(const v_uint64x2& a)
inline int v_signmask(const v_float64x2& a)
{ return v_signmask(v_reinterpret_as_s64(a)); }
template<typename _Tpvec>
inline bool v_check_all(const _Tpvec& a)
{ return vec_all_lt(a.val, _Tpvec().val);}
inline bool v_check_all(const v_uint8x16 &a)
{ return vec_all_lt(a.val, _Tpvec().val); }
inline bool v_check_all(const v_uint8x16& a)
{ return v_check_all(v_reinterpret_as_s8(a)); }
inline bool v_check_all(const v_uint16x8 &a)
inline bool v_check_all(const v_uint16x8& a)
{ return v_check_all(v_reinterpret_as_s16(a)); }
inline bool v_check_all(const v_uint32x4 &a)
inline bool v_check_all(const v_uint32x4& a)
{ return v_check_all(v_reinterpret_as_s32(a)); }
inline bool v_check_all(const v_float32x4& a)
{ return v_check_all(v_reinterpret_as_s32(a)); }
inline bool v_check_all(const v_float64x2& a)
{ return v_check_all(v_reinterpret_as_s64(a)); }
template<typename _Tpvec>
inline bool v_check_any(const _Tpvec& a)
{ return vec_any_lt(a.val, _Tpvec().val);}
inline bool v_check_any(const v_uint8x16 &a)
{ return vec_any_lt(a.val, _Tpvec().val); }
inline bool v_check_any(const v_uint8x16& a)
{ return v_check_any(v_reinterpret_as_s8(a)); }
inline bool v_check_any(const v_uint16x8 &a)
inline bool v_check_any(const v_uint16x8& a)
{ return v_check_any(v_reinterpret_as_s16(a)); }
inline bool v_check_any(const v_uint32x4 &a)
inline bool v_check_any(const v_uint32x4& a)
{ return v_check_any(v_reinterpret_as_s32(a)); }
inline bool v_check_any(const v_float32x4& a)
{ return v_check_any(v_reinterpret_as_s32(a)); }
inline bool v_check_any(const v_float64x2& a)
{ return v_check_any(v_reinterpret_as_s64(a)); }
////////// Other math /////////
@@ -815,12 +891,17 @@ inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_sqrt(vec_madd(a.val, a.val, vec_mul(b.val, b.val)))); } \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_madd(a.val, a.val, vec_mul(b.val, b.val))); } \
inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
{ return _Tpvec(vec_madd(a.val, b.val, c.val)); } \
inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
{ return _Tpvec(vec_madd(a.val, b.val, c.val)); }
OPENCV_HAL_IMPL_VSX_MULADD(v_float32x4)
OPENCV_HAL_IMPL_VSX_MULADD(v_float64x2)
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{ return a * b + c; }
// TODO: exp, log, sin, cos
/** Absolute values **/
@@ -839,25 +920,39 @@ inline v_float32x4 v_abs(const v_float32x4& x)
inline v_float64x2 v_abs(const v_float64x2& x)
{ return v_float64x2(vec_abs(x.val)); }
/** Absolute difference **/
// unsigned
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_absdiff, vec_absd)
#define OPENCV_HAL_IMPL_VSX_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec2(cast(intrin(a.val, b.val))); }
inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
{ return v_reinterpret_as_u8(v_sub_wrap(v_max(a, b), v_min(a, b))); }
inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
{ return v_reinterpret_as_u32(v_max(a, b) - v_min(a, b)); }
OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int8x16, v_uint8x16, vec_uchar16_c, v_absdiff, vec_absd)
OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int16x8, v_uint16x8, vec_ushort8_c, v_absdiff, vec_absd)
OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int32x4, v_uint32x4, vec_uint4_c, v_absdiff, vec_absd)
OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int64x2, v_uint64x2, vec_udword2_c, v_absdiff, vec_absd)
inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
{ return v_abs(a - b); }
inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
{ return v_abs(a - b); }
/** Absolute difference for signed integers **/
inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
{ return v_int8x16(vec_abss(vec_subs(a.val, b.val))); }
inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
{ return v_int16x8(vec_abss(vec_subs(a.val, b.val))); }
////////// Conversions /////////
/** Rounding **/
inline v_int32x4 v_round(const v_float32x4& a)
{ return v_int32x4(vec_cts(vec_round(a.val))); }
{ return v_int32x4(vec_cts(vec_rint(a.val))); }
inline v_int32x4 v_round(const v_float64x2& a)
{ return v_int32x4(vec_mergesqo(vec_ctso(vec_round(a.val)), vec_int4_z)); }
{ return v_int32x4(vec_mergesqo(vec_ctso(vec_rint(a.val)), vec_int4_z)); }
inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
{ return v_int32x4(vec_mergesqo(vec_ctso(vec_rint(a.val)), vec_ctso(vec_rint(b.val)))); }
inline v_int32x4 v_floor(const v_float32x4& a)
{ return v_int32x4(vec_cts(vec_floor(a.val))); }
@@ -884,6 +979,9 @@ inline v_float32x4 v_cvt_f32(const v_int32x4& a)
inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{ return v_float32x4(vec_mergesqo(vec_cvfo(a.val), vec_float4_z)); }
inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{ return v_float32x4(vec_mergesqo(vec_cvfo(a.val), vec_cvfo(b.val))); }
inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{ return v_float64x2(vec_ctdo(vec_mergeh(a.val, a.val))); }
@@ -896,6 +994,271 @@ inline v_float64x2 v_cvt_f64(const v_float32x4& a)
inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{ return v_float64x2(vec_cvfo(vec_mergel(a.val, a.val))); }
////////////// Lookup table access ////////////////////
inline v_int8x16 v_lut(const schar* tab, const int* idx)
{
return v_int8x16(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]],
tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]);
}
inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
{
return v_reinterpret_as_s8(v_int16x8(*(const short*)(tab+idx[0]), *(const short*)(tab+idx[1]), *(const short*)(tab+idx[2]), *(const short*)(tab+idx[3]),
*(const short*)(tab+idx[4]), *(const short*)(tab+idx[5]), *(const short*)(tab+idx[6]), *(const short*)(tab+idx[7])));
}
inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
{
return v_reinterpret_as_s8(v_int32x4(*(const int*)(tab+idx[0]), *(const int*)(tab+idx[1]), *(const int*)(tab+idx[2]), *(const int*)(tab+idx[3])));
}
inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar*)tab, idx)); }
inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar*)tab, idx)); }
inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((const schar*)tab, idx)); }
inline v_int16x8 v_lut(const short* tab, const int* idx)
{
return v_int16x8(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]);
}
inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
{
return v_reinterpret_as_s16(v_int32x4(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]), *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
}
inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
{
return v_reinterpret_as_s16(v_int64x2(*(const int64*)(tab + idx[0]), *(const int64*)(tab + idx[1])));
}
inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((const short*)tab, idx)); }
inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((const short*)tab, idx)); }
inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((const short*)tab, idx)); }
inline v_int32x4 v_lut(const int* tab, const int* idx)
{
return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
}
inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
{
return v_reinterpret_as_s32(v_int64x2(*(const int64*)(tab + idx[0]), *(const int64*)(tab + idx[1])));
}
inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
{
return v_int32x4(vsx_ld(0, tab + idx[0]));
}
inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int*)tab, idx)); }
inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int*)tab, idx)); }
inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int*)tab, idx)); }
inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
{
return v_int64x2(tab[idx[0]], tab[idx[1]]);
}
inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
{
return v_int64x2(vsx_ld2(0, tab + idx[0]));
}
inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
inline v_float32x4 v_lut(const float* tab, const int* idx)
{
return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
}
inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_pairs((const int*)tab, idx)); }
inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { return v_load(tab + *idx); }
inline v_float64x2 v_lut(const double* tab, const int* idx)
{
return v_float64x2(tab[idx[0]], tab[idx[1]]);
}
inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) { return v_load(tab + *idx); }
inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
const int idx[4] = {
vec_extract(idxvec.val, 0),
vec_extract(idxvec.val, 1),
vec_extract(idxvec.val, 2),
vec_extract(idxvec.val, 3)
};
return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
}
inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
{
const int idx[4] = {
vec_extract(idxvec.val, 0),
vec_extract(idxvec.val, 1),
vec_extract(idxvec.val, 2),
vec_extract(idxvec.val, 3)
};
return v_uint32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
}
inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
const int idx[4] = {
vec_extract(idxvec.val, 0),
vec_extract(idxvec.val, 1),
vec_extract(idxvec.val, 2),
vec_extract(idxvec.val, 3)
};
return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
}
inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
const int idx[2] = {
vec_extract(idxvec.val, 0),
vec_extract(idxvec.val, 1)
};
return v_float64x2(tab[idx[0]], tab[idx[1]]);
}
inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
{
vec_float4 xy0 = vec_ld_l8(tab + vec_extract(idxvec.val, 0));
vec_float4 xy1 = vec_ld_l8(tab + vec_extract(idxvec.val, 1));
vec_float4 xy2 = vec_ld_l8(tab + vec_extract(idxvec.val, 2));
vec_float4 xy3 = vec_ld_l8(tab + vec_extract(idxvec.val, 3));
vec_float4 xy02 = vec_mergeh(xy0, xy2); // x0, x2, y0, y2
vec_float4 xy13 = vec_mergeh(xy1, xy3); // x1, x3, y1, y3
x.val = vec_mergeh(xy02, xy13);
y.val = vec_mergel(xy02, xy13);
}
inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
{
vec_double2 xy0 = vsx_ld(vec_extract(idxvec.val, 0), tab);
vec_double2 xy1 = vsx_ld(vec_extract(idxvec.val, 1), tab);
x.val = vec_mergeh(xy0, xy1);
y.val = vec_mergel(xy0, xy1);
}
inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
{
static const vec_uchar16 perm = {0, 2, 1, 3, 4, 6, 5, 7, 8, 10, 9, 11, 12, 14, 13, 15};
return v_int8x16(vec_perm(vec.val, vec.val, perm));
}
inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec)
{ return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
{
static const vec_uchar16 perm = {0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15};
return v_int8x16(vec_perm(vec.val, vec.val, perm));
}
inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec)
{ return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
{
static const vec_uchar16 perm = {0,1, 4,5, 2,3, 6,7, 8,9, 12,13, 10,11, 14,15};
return v_int16x8(vec_perm(vec.val, vec.val, perm));
}
inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec)
{ return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
{
static const vec_uchar16 perm = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15};
return v_int16x8(vec_perm(vec.val, vec.val, perm));
}
inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec)
{ return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
{
static const vec_uchar16 perm = {0,1,2,3, 8,9,10,11, 4,5,6,7, 12,13,14,15};
return v_int32x4(vec_perm(vec.val, vec.val, perm));
}
inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec)
{ return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_float32x4 v_interleave_pairs(const v_float32x4& vec)
{ return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
{
static const vec_uchar16 perm = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, 15, 15, 15};
return v_int8x16(vec_perm(vec.val, vec.val, perm));
}
inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec)
{ return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
{
static const vec_uchar16 perm = {0,1, 2,3, 4,5, 8,9, 10,11, 12,13, 14,15, 14,15};
return v_int16x8(vec_perm(vec.val, vec.val, perm));
}
inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec)
{ return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
inline v_int32x4 v_pack_triplets(const v_int32x4& vec)
{ return vec; }
inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec)
{ return vec; }
inline v_float32x4 v_pack_triplets(const v_float32x4& vec)
{ return vec; }
/////// FP16 support ////////
inline v_float32x4 v_load_expand(const float16_t* ptr)
{
vec_ushort8 vf16 = vec_ld_l8((const ushort*)ptr);
#if CV_VSX3 && defined(vec_extract_fp_from_shorth)
return v_float32x4(vec_extract_fp_from_shorth(vf16));
#elif CV_VSX3 && !defined(CV_COMPILER_VSX_BROKEN_ASM)
vec_float4 vf32;
__asm__ __volatile__ ("xvcvhpsp %x0,%x1" : "=wf" (vf32) : "wa" (vec_mergeh(vf16, vf16)));
return v_float32x4(vf32);
#else
const vec_int4 z = vec_int4_z, delta = vec_int4_sp(0x38000000);
const vec_int4 signmask = vec_int4_sp(0x80000000);
const vec_int4 maxexp = vec_int4_sp(0x7c000000);
const vec_float4 deltaf = vec_float4_c(vec_int4_sp(0x38800000));
vec_int4 bits = vec_int4_c(vec_mergeh(vec_short8_c(z), vec_short8_c(vf16)));
vec_int4 e = vec_and(bits, maxexp), sign = vec_and(bits, signmask);
vec_int4 t = vec_add(vec_sr(vec_xor(bits, sign), vec_uint4_sp(3)), delta); // ((h & 0x7fff) << 13) + delta
vec_int4 zt = vec_int4_c(vec_sub(vec_float4_c(vec_add(t, vec_int4_sp(1 << 23))), deltaf));
t = vec_add(t, vec_and(delta, vec_cmpeq(maxexp, e)));
vec_bint4 zmask = vec_cmpeq(e, z);
vec_int4 ft = vec_sel(t, zt, zmask);
return v_float32x4(vec_float4_c(vec_or(ft, sign)));
#endif
}
inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
{
// fixme: Is there any builtin op or intrinsic that covers "xvcvsphp"?
#if CV_VSX3 && !defined(CV_COMPILER_VSX_BROKEN_ASM)
vec_ushort8 vf16;
__asm__ __volatile__ ("xvcvsphp %x0,%x1" : "=wa" (vf16) : "wf" (v.val));
vec_st_l8(vec_mergesqe(vf16, vf16), ptr);
#else
const vec_int4 signmask = vec_int4_sp(0x80000000);
const vec_int4 rval = vec_int4_sp(0x3f000000);
vec_int4 t = vec_int4_c(v.val);
vec_int4 sign = vec_sra(vec_and(t, signmask), vec_uint4_sp(16));
t = vec_and(vec_nor(signmask, signmask), t);
vec_bint4 finitemask = vec_cmpgt(vec_int4_sp(0x47800000), t);
vec_bint4 isnan = vec_cmpgt(t, vec_int4_sp(0x7f800000));
vec_int4 naninf = vec_sel(vec_int4_sp(0x7c00), vec_int4_sp(0x7e00), isnan);
vec_bint4 tinymask = vec_cmpgt(vec_int4_sp(0x38800000), t);
vec_int4 tt = vec_int4_c(vec_add(vec_float4_c(t), vec_float4_c(rval)));
tt = vec_sub(tt, rval);
vec_int4 odd = vec_and(vec_sr(t, vec_uint4_sp(13)), vec_int4_sp(1));
vec_int4 nt = vec_add(t, vec_int4_sp(0xc8000fff));
nt = vec_sr(vec_add(nt, odd), vec_uint4_sp(13));
t = vec_sel(nt, tt, tinymask);
t = vec_sel(naninf, t, finitemask);
t = vec_or(t, sign);
vec_st_l8(vec_packs(t, t), ptr);
#endif
}
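// --- Illustrative sketch (editor's addition, not part of the upstream diff) ---
// Three-tier FP16 load strategy above: use the POWER9 builtin when the
// compiler exposes vec_extract_fp_from_shorth, else raw xvcvhpsp inline asm,
// else a branch-free integer bit-twiddle that shifts the half's exponent and
// mantissa into float position, rebiases the exponent, and patches the
// zero/denormal and Inf/NaN cases with vec_sel masks. The store path has only
// the asm and bit-twiddle tiers; the fixme above notes that no builtin was
// found for xvcvsphp.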
inline void v_cleanup() {}
/** Reinterpret **/
/** it's up there with the load and store operations **/
@@ -904,6 +1267,9 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{ return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)); }
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_int32x4(vec_msum(a.val, b.val, c.val)); }
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
const v_float32x4& m1, const v_float32x4& m2,
const v_float32x4& m3)

View File

@@ -53,9 +53,7 @@
#include "opencv2/core/bufferpool.hpp"
#ifdef CV_CXX11
#include <type_traits>
#endif
namespace cv
{
@@ -63,8 +61,10 @@ namespace cv
//! @addtogroup core_basic
//! @{
enum { ACCESS_READ=1<<24, ACCESS_WRITE=1<<25,
enum AccessFlag { ACCESS_READ=1<<24, ACCESS_WRITE=1<<25,
ACCESS_RW=3<<24, ACCESS_MASK=ACCESS_RW, ACCESS_FAST=1<<26 };
CV_ENUM_FLAGS(AccessFlag)
__CV_ENUM_FLAGS_BITWISE_AND(AccessFlag, int, AccessFlag)
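// Since the ACCESS_* constants are now a named enum rather than plain ints,
// bitwise combinations rely on the operators generated by CV_ENUM_FLAGS.
// A caller-side sketch:
cv::AccessFlag mode = cv::ACCESS_READ | cv::ACCESS_WRITE;      // equals ACCESS_RW
bool writable = (mode & cv::ACCESS_WRITE) == cv::ACCESS_WRITE; // test one flag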
CV__DEBUG_NS_BEGIN
@@ -148,11 +148,17 @@ synonym is needed to generate Python/Java etc. wrappers properly. At the functio
level their use is similar, but _InputArray::getMat(idx) should be used to get header for the
idx-th component of the outer vector and _InputArray::size().area() should be used to find the
number of components (vectors/matrices) of the outer vector.
In general, type support is limited to cv::Mat types. Other types are forbidden.
But in some cases we need to support passing of custom non-general Mat types, like arrays of cv::KeyPoint, cv::DMatch, etc.
This data is not intended to be interpreted as image data, or processed like a regular cv::Mat.
To pass such custom types, use the rawIn() / rawOut() / rawInOut() wrappers.
The custom type is wrapped as Mat-compatible `CV_8UC<N>` values (N = sizeof(T), N <= CV_CN_MAX).
*/
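// A usage sketch of the wrappers described above; consumeKeypoints is a
// hypothetical function taking an InputArray.
std::vector<cv::KeyPoint> kpts;            // filled elsewhere
consumeKeypoints(cv::rawIn(kpts));         // viewed as CV_8UC<sizeof(cv::KeyPoint)>, not as an image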
class CV_EXPORTS _InputArray
{
public:
enum {
enum KindFlag {
KIND_SHIFT = 16,
FIXED_TYPE = 0x8000 << KIND_SHIFT,
FIXED_SIZE = 0x4000 << KIND_SHIFT,
@@ -185,7 +191,7 @@ public:
template<typename _Tp> _InputArray(const std::vector<_Tp>& vec);
_InputArray(const std::vector<bool>& vec);
template<typename _Tp> _InputArray(const std::vector<std::vector<_Tp> >& vec);
_InputArray(const std::vector<std::vector<bool> >&);
_InputArray(const std::vector<std::vector<bool> >&) = delete; // not supported
template<typename _Tp> _InputArray(const std::vector<Mat_<_Tp> >& vec);
template<typename _Tp> _InputArray(const _Tp* vec, int n);
template<typename _Tp, int m, int n> _InputArray(const Matx<_Tp, m, n>& matx);
@@ -198,10 +204,11 @@ public:
_InputArray(const UMat& um);
_InputArray(const std::vector<UMat>& umv);
#ifdef CV_CXX_STD_ARRAY
template<typename _Tp, std::size_t _Nm> _InputArray(const std::array<_Tp, _Nm>& arr);
template<std::size_t _Nm> _InputArray(const std::array<Mat, _Nm>& arr);
#endif
template<typename _Tp> static _InputArray rawIn(const std::vector<_Tp>& vec);
template<typename _Tp, std::size_t _Nm> static _InputArray rawIn(const std::array<_Tp, _Nm>& arr);
Mat getMat(int idx=-1) const;
Mat getMat_(int idx=-1) const;
@@ -216,7 +223,7 @@ public:
void* getObj() const;
Size getSz() const;
int kind() const;
_InputArray::KindFlag kind() const;
int dims(int i=-1) const;
int cols(int i=-1) const;
int rows(int i=-1) const;
@@ -240,6 +247,7 @@ public:
bool isUMatVector() const;
bool isMatx() const;
bool isVector() const;
bool isGpuMat() const;
bool isGpuMatVector() const;
~_InputArray();
@@ -251,7 +259,8 @@ protected:
void init(int _flags, const void* _obj);
void init(int _flags, const void* _obj, Size _sz);
};
CV_ENUM_FLAGS(_InputArray::KindFlag)
__CV_ENUM_FLAGS_BITWISE_AND(_InputArray::KindFlag, int, _InputArray::KindFlag)
/** @brief This type is very similar to InputArray except that it is used for input/output and output function
parameters.
@@ -281,7 +290,7 @@ generators:
class CV_EXPORTS _OutputArray : public _InputArray
{
public:
enum
enum DepthMask
{
DEPTH_MASK_8U = 1 << CV_8U,
DEPTH_MASK_8S = 1 << CV_8S,
@@ -290,8 +299,10 @@ public:
DEPTH_MASK_32S = 1 << CV_32S,
DEPTH_MASK_32F = 1 << CV_32F,
DEPTH_MASK_64F = 1 << CV_64F,
DEPTH_MASK_16F = 1 << CV_16F,
DEPTH_MASK_ALL = (DEPTH_MASK_64F<<1)-1,
DEPTH_MASK_ALL_BUT_8S = DEPTH_MASK_ALL & ~DEPTH_MASK_8S,
DEPTH_MASK_ALL_16F = (DEPTH_MASK_16F<<1)-1,
DEPTH_MASK_FLT = DEPTH_MASK_32F + DEPTH_MASK_64F
};
@@ -305,9 +316,9 @@ public:
_OutputArray(cuda::HostMem& cuda_mem);
template<typename _Tp> _OutputArray(cudev::GpuMat_<_Tp>& m);
template<typename _Tp> _OutputArray(std::vector<_Tp>& vec);
_OutputArray(std::vector<bool>& vec);
_OutputArray(std::vector<bool>& vec) = delete; // not supported
template<typename _Tp> _OutputArray(std::vector<std::vector<_Tp> >& vec);
_OutputArray(std::vector<std::vector<bool> >&);
_OutputArray(std::vector<std::vector<bool> >&) = delete; // not supported
template<typename _Tp> _OutputArray(std::vector<Mat_<_Tp> >& vec);
template<typename _Tp> _OutputArray(Mat_<_Tp>& m);
template<typename _Tp> _OutputArray(_Tp* vec, int n);
@@ -331,12 +342,13 @@ public:
_OutputArray(const UMat& m);
_OutputArray(const std::vector<UMat>& vec);
#ifdef CV_CXX_STD_ARRAY
template<typename _Tp, std::size_t _Nm> _OutputArray(std::array<_Tp, _Nm>& arr);
template<typename _Tp, std::size_t _Nm> _OutputArray(const std::array<_Tp, _Nm>& arr);
template<std::size_t _Nm> _OutputArray(std::array<Mat, _Nm>& arr);
template<std::size_t _Nm> _OutputArray(const std::array<Mat, _Nm>& arr);
#endif
template<typename _Tp> static _OutputArray rawOut(std::vector<_Tp>& vec);
template<typename _Tp, std::size_t _Nm> static _OutputArray rawOut(std::array<_Tp, _Nm>& arr);
bool fixedSize() const;
bool fixedType() const;
@@ -347,9 +359,9 @@ public:
std::vector<cuda::GpuMat>& getGpuMatVecRef() const;
ogl::Buffer& getOGlBufferRef() const;
cuda::HostMem& getHostMemRef() const;
void create(Size sz, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
void create(int rows, int cols, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
void create(int dims, const int* size, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
void create(Size sz, int type, int i=-1, bool allowTransposed=false, _OutputArray::DepthMask fixedDepthMask=static_cast<_OutputArray::DepthMask>(0)) const;
void create(int rows, int cols, int type, int i=-1, bool allowTransposed=false, _OutputArray::DepthMask fixedDepthMask=static_cast<_OutputArray::DepthMask>(0)) const;
void create(int dims, const int* size, int type, int i=-1, bool allowTransposed=false, _OutputArray::DepthMask fixedDepthMask=static_cast<_OutputArray::DepthMask>(0)) const;
void createSameSize(const _InputArray& arr, int mtype) const;
void release() const;
void clear() const;
@@ -375,7 +387,7 @@ public:
_InputOutputArray(cuda::HostMem& cuda_mem);
template<typename _Tp> _InputOutputArray(cudev::GpuMat_<_Tp>& m);
template<typename _Tp> _InputOutputArray(std::vector<_Tp>& vec);
_InputOutputArray(std::vector<bool>& vec);
_InputOutputArray(std::vector<bool>& vec) = delete; // not supported
template<typename _Tp> _InputOutputArray(std::vector<std::vector<_Tp> >& vec);
template<typename _Tp> _InputOutputArray(std::vector<Mat_<_Tp> >& vec);
template<typename _Tp> _InputOutputArray(Mat_<_Tp>& m);
@@ -400,15 +412,23 @@ public:
_InputOutputArray(const UMat& m);
_InputOutputArray(const std::vector<UMat>& vec);
#ifdef CV_CXX_STD_ARRAY
template<typename _Tp, std::size_t _Nm> _InputOutputArray(std::array<_Tp, _Nm>& arr);
template<typename _Tp, std::size_t _Nm> _InputOutputArray(const std::array<_Tp, _Nm>& arr);
template<std::size_t _Nm> _InputOutputArray(std::array<Mat, _Nm>& arr);
template<std::size_t _Nm> _InputOutputArray(const std::array<Mat, _Nm>& arr);
#endif
template<typename _Tp> static _InputOutputArray rawInOut(std::vector<_Tp>& vec);
template<typename _Tp, std::size_t _Nm> _InputOutputArray rawInOut(std::array<_Tp, _Nm>& arr);
};
/** Helper to wrap custom types. @see InputArray */
template<typename _Tp> static inline _InputArray rawIn(_Tp& v);
/** Helper to wrap custom types. @see InputArray */
template<typename _Tp> static inline _OutputArray rawOut(_Tp& v);
/** Helper to wrap custom types. @see InputArray */
template<typename _Tp> static inline _InputOutputArray rawInOut(_Tp& v);
CV__DEBUG_NS_END
typedef const _InputArray& InputArray;
@@ -450,10 +470,10 @@ public:
// uchar*& datastart, uchar*& data, size_t* step) = 0;
//virtual void deallocate(int* refcount, uchar* datastart, uchar* data) = 0;
virtual UMatData* allocate(int dims, const int* sizes, int type,
void* data, size_t* step, int flags, UMatUsageFlags usageFlags) const = 0;
virtual bool allocate(UMatData* data, int accessflags, UMatUsageFlags usageFlags) const = 0;
void* data, size_t* step, AccessFlag flags, UMatUsageFlags usageFlags) const = 0;
virtual bool allocate(UMatData* data, AccessFlag accessflags, UMatUsageFlags usageFlags) const = 0;
virtual void deallocate(UMatData* data) const = 0;
virtual void map(UMatData* data, int accessflags) const;
virtual void map(UMatData* data, AccessFlag accessflags) const;
virtual void unmap(UMatData* data) const;
virtual void download(UMatData* data, void* dst, int dims, const size_t sz[],
const size_t srcofs[], const size_t srcstep[],
@@ -506,7 +526,7 @@ protected:
// it should be explicitly initialized using init().
struct CV_EXPORTS UMatData
{
enum { COPY_ON_MAP=1, HOST_COPY_OBSOLETE=2,
enum MemoryFlag { COPY_ON_MAP=1, HOST_COPY_OBSOLETE=2,
DEVICE_COPY_OBSOLETE=4, TEMP_UMAT=8, TEMP_COPIED_UMAT=24,
USER_ALLOCATED=32, DEVICE_MEM_MAPPED=64,
ASYNC_CLEANUP=128
@@ -536,30 +556,24 @@ struct CV_EXPORTS UMatData
uchar* origdata;
size_t size;
int flags;
UMatData::MemoryFlag flags;
void* handle;
void* userdata;
int allocatorFlags_;
int mapcount;
UMatData* originalUMatData;
};
struct CV_EXPORTS UMatDataAutoLock
{
explicit UMatDataAutoLock(UMatData* u);
~UMatDataAutoLock();
UMatData* u;
};
CV_ENUM_FLAGS(UMatData::MemoryFlag)
struct CV_EXPORTS MatSize
{
explicit MatSize(int* _p);
int dims() const;
Size operator()() const;
const int& operator[](int i) const;
int& operator[](int i);
operator const int*() const;
operator const int*() const; // TODO OpenCV 4.0: drop this
bool operator == (const MatSize& sz) const;
bool operator != (const MatSize& sz) const;
@@ -581,7 +595,7 @@ protected:
MatStep& operator = (const MatStep&);
};
/** @example cout_mat.cpp
/** @example samples/cpp/cout_mat.cpp
An example demonstrating the serial out capabilities of cv::Mat
*/
@@ -600,12 +614,11 @@ Note that `M.step[i] >= M.step[i+1]` (in fact, `M.step[i] >= M.step[i+1]*M.size[
that 2-dimensional matrices are stored row-by-row, 3-dimensional matrices are stored plane-by-plane,
and so on. M.step[M.dims-1] is minimal and always equal to the element size M.elemSize() .
So, the data layout in Mat is fully compatible with CvMat, IplImage, and CvMatND types from OpenCV
1.x. It is also compatible with the majority of dense array types from the standard toolkits and
SDKs, such as Numpy (ndarray), Win32 (independent device bitmaps), and others, that is, with any
array that uses *steps* (or *strides*) to compute the position of a pixel. Due to this
compatibility, it is possible to make a Mat header for user-allocated data and process it in-place
using OpenCV functions.
So, the data layout in Mat is compatible with the majority of dense array types from the standard
toolkits and SDKs, such as Numpy (ndarray), Win32 (independent device bitmaps), and others,
that is, with any array that uses *steps* (or *strides*) to compute the position of a pixel.
Due to this compatibility, it is possible to make a Mat header for user-allocated data and process
it in-place using OpenCV functions.
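A sketch of that user-allocated-data case: no data is copied, and the caller keeps ownership of the buffer.
@code
float buffer[480 * 640];                     // externally managed storage
cv::Mat view(480, 640, CV_32FC1, buffer);    // AUTO_STEP: rows are tightly packed
view *= 2.0f;                                // processes the user data in place
@endcode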
There are many different ways to create a Mat object. The most popular options are listed below:
@@ -690,10 +703,6 @@ sub-matrices.
Mat M = Mat(3, 3, CV_64F, m).inv();
@endcode
.
Partial yet very common cases of this *user-allocated data* case are conversions from CvMat and
IplImage to Mat. For this purpose, there is function cv::cvarrToMat taking pointers to CvMat or
IplImage and the optional flag indicating whether to copy the data or not.
@snippet samples/cpp/image.cpp iplimage
- Use MATLAB-style array initializers, zeros(), ones(), eye(), for example:
@code
@@ -992,18 +1001,18 @@ public:
*/
template<typename _Tp> explicit Mat(const std::vector<_Tp>& vec, bool copyData=false);
#ifdef CV_CXX11
/** @overload
*/
template<typename _Tp, typename = typename std::enable_if<std::is_arithmetic<_Tp>::value>::type>
explicit Mat(const std::initializer_list<_Tp> list);
#endif
#ifdef CV_CXX_STD_ARRAY
/** @overload
*/
template<typename _Tp> explicit Mat(const std::initializer_list<int> sizes, const std::initializer_list<_Tp> list);
/** @overload
*/
template<typename _Tp, size_t _Nm> explicit Mat(const std::array<_Tp, _Nm>& arr, bool copyData=false);
#endif
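// The initializer-list constructors above in use (a sketch):
cv::Mat col({1, 2, 3});                      // 3x1 CV_32S column vector
cv::Mat grid({2, 3}, {1, 2, 3, 4, 5, 6});    // 2x3 matrix filled row by row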
/** @overload
*/
@@ -1051,7 +1060,7 @@ public:
Mat& operator = (const MatExpr& expr);
//! retrieve UMat from Mat
UMat getUMat(int accessFlags, UMatUsageFlags usageFlags = USAGE_DEFAULT) const;
UMat getUMat(AccessFlag accessFlags, UMatUsageFlags usageFlags = USAGE_DEFAULT) const;
/** @brief Creates a matrix header for the specified matrix row.
@@ -1173,7 +1182,7 @@ public:
The method creates a full copy of the array. The original step[] is not taken into account. So, the
array copy is a continuous array occupying total()*elemSize() bytes.
*/
Mat clone() const;
Mat clone() const CV_NODISCARD;
/** @brief Copies the matrix to another one.
@@ -1326,7 +1335,7 @@ public:
/** @brief Returns a zero array of the specified size and type.
The method returns a Matlab-style zero array initializer. It can be used to quickly form a constant
array as a function parameter, part of a matrix expression, or as a matrix initializer. :
array as a function parameter, part of a matrix expression, or as a matrix initializer:
@code
Mat A;
A = Mat::zeros(3, 3, CV_32F);
@@ -1362,6 +1371,8 @@ public:
The above operation does not form a 100x100 matrix of 1's and then multiply it by 3. Instead, it
just remembers the scale factor (3 in this case) and uses it when actually invoking the matrix
initializer.
@note In case of multi-channel types, only the first channel will be initialized with 1's; the
others will be set to 0's.
@param rows Number of rows.
@param cols Number of columns.
@param type Created matrix type.
@@ -1389,6 +1400,8 @@ public:
// make a 4x4 diagonal matrix with 0.1's on the diagonal.
Mat A = Mat::eye(4, 4, CV_32F)*0.1;
@endcode
@note In case of multi-channel types, the identity matrix will be initialized only for the first
channel; the others will be set to 0's.
@param rows Number of rows.
@param cols Number of columns.
@param type Created matrix type.
@@ -1623,20 +1636,11 @@ public:
*/
Mat operator()(const std::vector<Range>& ranges) const;
// //! converts header to CvMat; no data is copied
// operator CvMat() const;
// //! converts header to CvMatND; no data is copied
// operator CvMatND() const;
// //! converts header to IplImage; no data is copied
// operator IplImage() const;
template<typename _Tp> operator std::vector<_Tp>() const;
template<typename _Tp, int n> operator Vec<_Tp, n>() const;
template<typename _Tp, int m, int n> operator Matx<_Tp, m, n>() const;
#ifdef CV_CXX_STD_ARRAY
template<typename _Tp, std::size_t _Nm> operator std::array<_Tp, _Nm>() const;
#endif
/** @brief Reports whether the matrix is continuous or not.
@@ -1786,7 +1790,27 @@ public:
*/
size_t total(int startDim, int endDim=INT_MAX) const;
//! returns N if the matrix is 1-channel (N x ptdim) or ptdim-channel (1 x N) or (N x 1); negative number otherwise
/**
* @param elemChannels Number of channels or number of columns the matrix should have.
* For a 2-D matrix, if it has only 1 column, it should have
* elemChannels channels; if it has only 1 channel,
* it should have elemChannels columns.
* For a 3-D matrix, it should have only one channel. Furthermore,
* if the number of planes is not one, then the number of rows
* within every plane has to be 1; if the number of rows within
* every plane is not 1, then the number of planes has to be 1.
* @param depth The depth the matrix should have. Set it to -1 when any depth is fine.
* @param requireContinuous Set it to true to require the matrix to be continuous
* @return -1 if the requirement is not satisfied.
* Otherwise, it returns the number of elements in the matrix. Note
* that an element may have multiple channels.
*
* The following code demonstrates its usage for a 2-d matrix:
* @snippet snippets/core_mat_checkVector.cpp example-2d
*
* The following code demonstrates its usage for a 3-d matrix:
* @snippet snippets/core_mat_checkVector.cpp example-3d
*/
int checkVector(int elemChannels, int depth=-1, bool requireContinuous=true) const;
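// A usage sketch (pts is assumed to be a cv::Mat holding a point list):
int n = pts.checkVector(2, CV_32F);          // accepts Nx2 1-channel or Nx1 2-channel float
CV_Assert(n >= 0);                           // -1 means the layout/depth did not match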
/** @brief Returns a pointer to the specified matrix row.
@@ -2040,10 +2064,8 @@ public:
/** @overload */
template<typename _Tp, typename Functor> void forEach(const Functor& operation) const;
#ifdef CV_CXX_MOVE_SEMANTICS
Mat(Mat&& m);
Mat& operator = (Mat&& m);
#endif
enum { MAGIC_VAL = 0x42FF0000, AUTO_STEP = 0, CONTINUOUS_FLAG = CV_MAT_CONT_FLAG, SUBMATRIX_FLAG = CV_SUBMAT_FLAG };
enum { MAGIC_MASK = 0xFFFF0000, TYPE_MASK = 0x00000FFF, DEPTH_MASK = 7 };
@@ -2074,6 +2096,9 @@ public:
static MatAllocator* getDefaultAllocator();
static void setDefaultAllocator(MatAllocator* allocator);
//! internal use method: updates the continuity flag
void updateContinuityFlag();
//! interaction with UMat
UMatData* u;
@@ -2168,7 +2193,7 @@ public:
Mat_(int _ndims, const int* _sizes);
//! n-dim array constructor that sets each matrix element to specified value
Mat_(int _ndims, const int* _sizes, const _Tp& value);
//! copy/conversion contructor. If m is of different type, it's converted
//! copy/conversion constructor. If m is of different type, it's converted
Mat_(const Mat& m);
//! copy constructor
Mat_(const Mat_& m);
@@ -2194,13 +2219,10 @@ public:
explicit Mat_(const Point3_<typename DataType<_Tp>::channel_type>& pt, bool copyData=true);
explicit Mat_(const MatCommaInitializer_<_Tp>& commaInitializer);
#ifdef CV_CXX11
Mat_(std::initializer_list<_Tp> values);
#endif
explicit Mat_(const std::initializer_list<int> sizes, const std::initializer_list<_Tp> values);
#ifdef CV_CXX_STD_ARRAY
template <std::size_t _Nm> explicit Mat_(const std::array<_Tp, _Nm>& arr, bool copyData=false);
#endif
Mat_& operator = (const Mat& m);
Mat_& operator = (const Mat_& m);
@@ -2236,7 +2258,7 @@ public:
Mat_ row(int y) const;
Mat_ col(int x) const;
Mat_ diag(int d=0) const;
Mat_ clone() const;
Mat_ clone() const CV_NODISCARD;
//! overridden forms of Mat::elemSize() etc.
size_t elemSize() const;
@@ -2258,7 +2280,7 @@ public:
static MatExpr eye(int rows, int cols);
static MatExpr eye(Size size);
//! some more overriden methods
//! some more overridden methods
Mat_& adjustROI( int dtop, int dbottom, int dleft, int dright );
Mat_ operator()( const Range& rowRange, const Range& colRange ) const;
Mat_ operator()( const Rect& roi ) const;
@@ -2298,17 +2320,14 @@ public:
//! conversion to vector.
operator std::vector<_Tp>() const;
#ifdef CV_CXX_STD_ARRAY
//! conversion to array.
template<std::size_t _Nm> operator std::array<_Tp, _Nm>() const;
#endif
//! conversion to Vec
template<int n> operator Vec<typename DataType<_Tp>::channel_type, n>() const;
//! conversion to Matx
template<int m, int n> operator Matx<typename DataType<_Tp>::channel_type, m, n>() const;
#ifdef CV_CXX_MOVE_SEMANTICS
Mat_(Mat_&& m);
Mat_& operator = (Mat_&& m);
@@ -2316,7 +2335,6 @@ public:
Mat_& operator = (Mat&& m);
Mat_(MatExpr&& e);
#endif
};
typedef Mat_<uchar> Mat1b;
@@ -2394,7 +2412,7 @@ public:
//! assignment operators
UMat& operator = (const UMat& m);
Mat getMat(int flags) const;
Mat getMat(AccessFlag flags) const;
//! returns a new matrix header for the specified row
UMat row(int y) const;
@@ -2415,7 +2433,7 @@ public:
static UMat diag(const UMat& d);
//! returns deep copy of the matrix, i.e. the data is copied
UMat clone() const;
UMat clone() const CV_NODISCARD;
//! copies the matrix content to "m".
// It calls m.create(this->size(), this->type()).
void copyTo( OutputArray m ) const;
@@ -2513,16 +2531,14 @@ public:
//! returns N if the matrix is 1-channel (N x ptdim) or ptdim-channel (1 x N) or (N x 1); negative number otherwise
int checkVector(int elemChannels, int depth=-1, bool requireContinuous=true) const;
#ifdef CV_CXX_MOVE_SEMANTICS
UMat(UMat&& m);
UMat& operator = (UMat&& m);
#endif
/*! Returns the OpenCL buffer handle on which UMat operates.
The UMat instance should be kept alive during the use of the handle to prevent the buffer
from being returned to the OpenCV buffer pool.
*/
void* handle(int accessFlags) const;
void* handle(AccessFlag accessFlags) const;
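// A usage sketch, assuming OpenCL headers are available and an OpenCL context
// is active; keep the UMat alive while the handle is in use.
cv::UMat um(480, 640, CV_8UC1);
cl_mem buf = (cl_mem)um.handle(cv::ACCESS_READ);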
void ndoffset(size_t* ofs) const;
enum { MAGIC_VAL = 0x42FF0000, AUTO_STEP = 0, CONTINUOUS_FLAG = CV_MAT_CONT_FLAG, SUBMATRIX_FLAG = CV_SUBMAT_FLAG };
@@ -2546,6 +2562,9 @@ public:
//! and the standard allocator
static MatAllocator* getStdAllocator();
//! internal use method: updates the continuity flag
void updateContinuityFlag();
// black-box container of UMat data
UMatData* u;
@@ -2707,7 +2726,7 @@ public:
SparseMat& operator = (const Mat& m);
//! creates full copy of the matrix
SparseMat clone() const;
SparseMat clone() const CV_NODISCARD;
//! copies all the data to the destination matrix. All the previous content of m is erased
void copyTo( SparseMat& m ) const;
@@ -2926,7 +2945,7 @@ public:
//! the default constructor
SparseMat_();
//! the full constructor equivelent to SparseMat(dims, _sizes, DataType<_Tp>::type)
//! the full constructor equivalent to SparseMat(dims, _sizes, DataType<_Tp>::type)
SparseMat_(int dims, const int* _sizes);
//! the copy constructor. If DataType<_Tp>.type != m.type(), the m elements are converted
SparseMat_(const SparseMat& m);
@@ -2944,7 +2963,7 @@ public:
SparseMat_& operator = (const Mat& m);
//! makes full copy of the matrix. All the elements are duplicated
SparseMat_ clone() const;
SparseMat_ clone() const CV_NODISCARD;
//! equivalent to cv::SparseMat::create(dims, _sizes, DataType<_Tp>::type)
void create(int dims, const int* _sizes);
//! converts sparse matrix to the old-style CvSparseMat. All the elements are copied
@@ -3540,6 +3559,10 @@ CV_EXPORTS MatExpr operator + (const Mat& m, const MatExpr& e);
CV_EXPORTS MatExpr operator + (const MatExpr& e, const Scalar& s);
CV_EXPORTS MatExpr operator + (const Scalar& s, const MatExpr& e);
CV_EXPORTS MatExpr operator + (const MatExpr& e1, const MatExpr& e2);
template<typename _Tp, int m, int n> static inline
MatExpr operator + (const Mat& a, const Matx<_Tp, m, n>& b) { return a + Mat(b); }
template<typename _Tp, int m, int n> static inline
MatExpr operator + (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) + b; }
CV_EXPORTS MatExpr operator - (const Mat& a, const Mat& b);
CV_EXPORTS MatExpr operator - (const Mat& a, const Scalar& s);
@@ -3549,6 +3572,10 @@ CV_EXPORTS MatExpr operator - (const Mat& m, const MatExpr& e);
CV_EXPORTS MatExpr operator - (const MatExpr& e, const Scalar& s);
CV_EXPORTS MatExpr operator - (const Scalar& s, const MatExpr& e);
CV_EXPORTS MatExpr operator - (const MatExpr& e1, const MatExpr& e2);
template<typename _Tp, int m, int n> static inline
MatExpr operator - (const Mat& a, const Matx<_Tp, m, n>& b) { return a - Mat(b); }
template<typename _Tp, int m, int n> static inline
MatExpr operator - (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) - b; }
CV_EXPORTS MatExpr operator - (const Mat& m);
CV_EXPORTS MatExpr operator - (const MatExpr& e);
@@ -3561,6 +3588,10 @@ CV_EXPORTS MatExpr operator * (const Mat& m, const MatExpr& e);
CV_EXPORTS MatExpr operator * (const MatExpr& e, double s);
CV_EXPORTS MatExpr operator * (double s, const MatExpr& e);
CV_EXPORTS MatExpr operator * (const MatExpr& e1, const MatExpr& e2);
template<typename _Tp, int m, int n> static inline
MatExpr operator * (const Mat& a, const Matx<_Tp, m, n>& b) { return a * Mat(b); }
template<typename _Tp, int m, int n> static inline
MatExpr operator * (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) * b; }
CV_EXPORTS MatExpr operator / (const Mat& a, const Mat& b);
CV_EXPORTS MatExpr operator / (const Mat& a, double s);
@@ -3570,52 +3601,100 @@ CV_EXPORTS MatExpr operator / (const Mat& m, const MatExpr& e);
CV_EXPORTS MatExpr operator / (const MatExpr& e, double s);
CV_EXPORTS MatExpr operator / (double s, const MatExpr& e);
CV_EXPORTS MatExpr operator / (const MatExpr& e1, const MatExpr& e2);
template<typename _Tp, int m, int n> static inline
MatExpr operator / (const Mat& a, const Matx<_Tp, m, n>& b) { return a / Mat(b); }
template<typename _Tp, int m, int n> static inline
MatExpr operator / (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) / b; }
CV_EXPORTS MatExpr operator < (const Mat& a, const Mat& b);
CV_EXPORTS MatExpr operator < (const Mat& a, double s);
CV_EXPORTS MatExpr operator < (double s, const Mat& a);
template<typename _Tp, int m, int n> static inline
MatExpr operator < (const Mat& a, const Matx<_Tp, m, n>& b) { return a < Mat(b); }
template<typename _Tp, int m, int n> static inline
MatExpr operator < (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) < b; }
CV_EXPORTS MatExpr operator <= (const Mat& a, const Mat& b);
CV_EXPORTS MatExpr operator <= (const Mat& a, double s);
CV_EXPORTS MatExpr operator <= (double s, const Mat& a);
template<typename _Tp, int m, int n> static inline
MatExpr operator <= (const Mat& a, const Matx<_Tp, m, n>& b) { return a <= Mat(b); }
template<typename _Tp, int m, int n> static inline
MatExpr operator <= (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) <= b; }
CV_EXPORTS MatExpr operator == (const Mat& a, const Mat& b);
CV_EXPORTS MatExpr operator == (const Mat& a, double s);
CV_EXPORTS MatExpr operator == (double s, const Mat& a);
template<typename _Tp, int m, int n> static inline
MatExpr operator == (const Mat& a, const Matx<_Tp, m, n>& b) { return a == Mat(b); }
template<typename _Tp, int m, int n> static inline
MatExpr operator == (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) == b; }
CV_EXPORTS MatExpr operator != (const Mat& a, const Mat& b);
CV_EXPORTS MatExpr operator != (const Mat& a, double s);
CV_EXPORTS MatExpr operator != (double s, const Mat& a);
template<typename _Tp, int m, int n> static inline
MatExpr operator != (const Mat& a, const Matx<_Tp, m, n>& b) { return a != Mat(b); }
template<typename _Tp, int m, int n> static inline
MatExpr operator != (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) != b; }
CV_EXPORTS MatExpr operator >= (const Mat& a, const Mat& b);
CV_EXPORTS MatExpr operator >= (const Mat& a, double s);
CV_EXPORTS MatExpr operator >= (double s, const Mat& a);
template<typename _Tp, int m, int n> static inline
MatExpr operator >= (const Mat& a, const Matx<_Tp, m, n>& b) { return a >= Mat(b); }
template<typename _Tp, int m, int n> static inline
MatExpr operator >= (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) >= b; }
CV_EXPORTS MatExpr operator > (const Mat& a, const Mat& b);
CV_EXPORTS MatExpr operator > (const Mat& a, double s);
CV_EXPORTS MatExpr operator > (double s, const Mat& a);
template<typename _Tp, int m, int n> static inline
MatExpr operator > (const Mat& a, const Matx<_Tp, m, n>& b) { return a > Mat(b); }
template<typename _Tp, int m, int n> static inline
MatExpr operator > (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) > b; }
CV_EXPORTS MatExpr operator & (const Mat& a, const Mat& b);
CV_EXPORTS MatExpr operator & (const Mat& a, const Scalar& s);
CV_EXPORTS MatExpr operator & (const Scalar& s, const Mat& a);
template<typename _Tp, int m, int n> static inline
MatExpr operator & (const Mat& a, const Matx<_Tp, m, n>& b) { return a & Mat(b); }
template<typename _Tp, int m, int n> static inline
MatExpr operator & (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) & b; }
CV_EXPORTS MatExpr operator | (const Mat& a, const Mat& b);
CV_EXPORTS MatExpr operator | (const Mat& a, const Scalar& s);
CV_EXPORTS MatExpr operator | (const Scalar& s, const Mat& a);
template<typename _Tp, int m, int n> static inline
MatExpr operator | (const Mat& a, const Matx<_Tp, m, n>& b) { return a | Mat(b); }
template<typename _Tp, int m, int n> static inline
MatExpr operator | (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) | b; }
CV_EXPORTS MatExpr operator ^ (const Mat& a, const Mat& b);
CV_EXPORTS MatExpr operator ^ (const Mat& a, const Scalar& s);
CV_EXPORTS MatExpr operator ^ (const Scalar& s, const Mat& a);
template<typename _Tp, int m, int n> static inline
MatExpr operator ^ (const Mat& a, const Matx<_Tp, m, n>& b) { return a ^ Mat(b); }
template<typename _Tp, int m, int n> static inline
MatExpr operator ^ (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) ^ b; }
CV_EXPORTS MatExpr operator ~(const Mat& m);
CV_EXPORTS MatExpr min(const Mat& a, const Mat& b);
CV_EXPORTS MatExpr min(const Mat& a, double s);
CV_EXPORTS MatExpr min(double s, const Mat& a);
template<typename _Tp, int m, int n> static inline
MatExpr min (const Mat& a, const Matx<_Tp, m, n>& b) { return min(a, Mat(b)); }
template<typename _Tp, int m, int n> static inline
MatExpr min (const Matx<_Tp, m, n>& a, const Mat& b) { return min(Mat(a), b); }
CV_EXPORTS MatExpr max(const Mat& a, const Mat& b);
CV_EXPORTS MatExpr max(const Mat& a, double s);
CV_EXPORTS MatExpr max(double s, const Mat& a);
template<typename _Tp, int m, int n> static inline
MatExpr max (const Mat& a, const Matx<_Tp, m, n>& b) { return max(a, Mat(b)); }
template<typename _Tp, int m, int n> static inline
MatExpr max (const Matx<_Tp, m, n>& a, const Mat& b) { return max(Mat(a), b); }
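// The new Mat/Matx overloads above in use (a sketch): a fixed-size matrix can
// now appear directly in Mat expressions without an explicit Mat(...) wrap.
cv::Mat A = cv::Mat::eye(3, 3, CV_64F);
cv::Matx33d B = cv::Matx33d::ones();
cv::Mat C = A + B;                           // previously required A + cv::Mat(B)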
/** @brief Calculates an absolute value of each matrix element.

View File

@@ -61,6 +61,16 @@ CV__DEBUG_NS_BEGIN
//! @cond IGNORED
////////////////////////// Custom (raw) type wrapper //////////////////////////
template<typename _Tp> static inline
int rawType()
{
CV_StaticAssert(sizeof(_Tp) <= CV_CN_MAX, "sizeof(_Tp) is too large");
const int elemSize = sizeof(_Tp);
return (int)CV_MAKETYPE(CV_8U, elemSize);
}
//////////////////////// Input/Output Arrays ////////////////////////
inline void _InputArray::init(int _flags, const void* _obj)
@@ -73,7 +83,7 @@ inline void* _InputArray::getObj() const { return obj; }
inline int _InputArray::getFlags() const { return flags; }
inline Size _InputArray::getSz() const { return sz; }
inline _InputArray::_InputArray() { init(NONE, 0); }
inline _InputArray::_InputArray() { init(0 + NONE, 0); }
inline _InputArray::_InputArray(int _flags, void* _obj) { init(_flags, _obj); }
inline _InputArray::_InputArray(const Mat& m) { init(MAT+ACCESS_READ, &m); }
inline _InputArray::_InputArray(const std::vector<Mat>& vec) { init(STD_VECTOR_MAT+ACCESS_READ, &vec); }
@@ -84,7 +94,6 @@ template<typename _Tp> inline
_InputArray::_InputArray(const std::vector<_Tp>& vec)
{ init(FIXED_TYPE + STD_VECTOR + traits::Type<_Tp>::value + ACCESS_READ, &vec); }
#ifdef CV_CXX_STD_ARRAY
template<typename _Tp, std::size_t _Nm> inline
_InputArray::_InputArray(const std::array<_Tp, _Nm>& arr)
{ init(FIXED_TYPE + FIXED_SIZE + STD_ARRAY + traits::Type<_Tp>::value + ACCESS_READ, arr.data(), Size(1, _Nm)); }
@@ -92,7 +101,6 @@ _InputArray::_InputArray(const std::array<_Tp, _Nm>& arr)
template<std::size_t _Nm> inline
_InputArray::_InputArray(const std::array<Mat, _Nm>& arr)
{ init(STD_ARRAY_MAT + ACCESS_READ, arr.data(), Size(1, _Nm)); }
#endif
inline
_InputArray::_InputArray(const std::vector<bool>& vec)
@@ -102,10 +110,6 @@ template<typename _Tp> inline
_InputArray::_InputArray(const std::vector<std::vector<_Tp> >& vec)
{ init(FIXED_TYPE + STD_VECTOR_VECTOR + traits::Type<_Tp>::value + ACCESS_READ, &vec); }
inline
_InputArray::_InputArray(const std::vector<std::vector<bool> >&)
{ CV_Error(Error::StsUnsupportedFormat, "std::vector<std::vector<bool> > is not supported!\n"); }
template<typename _Tp> inline
_InputArray::_InputArray(const std::vector<Mat_<_Tp> >& vec)
{ init(FIXED_TYPE + STD_VECTOR_MAT + traits::Type<_Tp>::value + ACCESS_READ, &vec); }
@@ -140,6 +144,25 @@ inline _InputArray::_InputArray(const ogl::Buffer& buf)
inline _InputArray::_InputArray(const cuda::HostMem& cuda_mem)
{ init(CUDA_HOST_MEM + ACCESS_READ, &cuda_mem); }
template<typename _Tp> inline
_InputArray _InputArray::rawIn(const std::vector<_Tp>& vec)
{
_InputArray v;
v.flags = _InputArray::FIXED_TYPE + _InputArray::STD_VECTOR + rawType<_Tp>() + ACCESS_READ;
v.obj = (void*)&vec;
return v;
}
template<typename _Tp, std::size_t _Nm> inline
_InputArray _InputArray::rawIn(const std::array<_Tp, _Nm>& arr)
{
_InputArray v;
v.flags = FIXED_TYPE + FIXED_SIZE + STD_ARRAY + traits::Type<_Tp>::value + ACCESS_READ;
v.obj = (void*)arr.data();
v.sz = Size(1, _Nm);
return v;
}
inline _InputArray::~_InputArray() {}
inline Mat _InputArray::getMat(int i) const
@@ -157,22 +180,22 @@ inline bool _InputArray::isMatx() const { return kind() == _InputArray::MATX; }
inline bool _InputArray::isVector() const { return kind() == _InputArray::STD_VECTOR ||
kind() == _InputArray::STD_BOOL_VECTOR ||
kind() == _InputArray::STD_ARRAY; }
inline bool _InputArray::isGpuMat() const { return kind() == _InputArray::CUDA_GPU_MAT; }
inline bool _InputArray::isGpuMatVector() const { return kind() == _InputArray::STD_VECTOR_CUDA_GPU_MAT; }
////////////////////////////////////////////////////////////////////////////////////////
inline _OutputArray::_OutputArray() { init(ACCESS_WRITE, 0); }
inline _OutputArray::_OutputArray(int _flags, void* _obj) { init(_flags|ACCESS_WRITE, _obj); }
inline _OutputArray::_OutputArray() { init(NONE + ACCESS_WRITE, 0); }
inline _OutputArray::_OutputArray(int _flags, void* _obj) { init(_flags + ACCESS_WRITE, _obj); }
inline _OutputArray::_OutputArray(Mat& m) { init(MAT+ACCESS_WRITE, &m); }
inline _OutputArray::_OutputArray(std::vector<Mat>& vec) { init(STD_VECTOR_MAT+ACCESS_WRITE, &vec); }
inline _OutputArray::_OutputArray(UMat& m) { init(UMAT+ACCESS_WRITE, &m); }
inline _OutputArray::_OutputArray(std::vector<UMat>& vec) { init(STD_VECTOR_UMAT+ACCESS_WRITE, &vec); }
inline _OutputArray::_OutputArray(std::vector<Mat>& vec) { init(STD_VECTOR_MAT + ACCESS_WRITE, &vec); }
inline _OutputArray::_OutputArray(UMat& m) { init(UMAT + ACCESS_WRITE, &m); }
inline _OutputArray::_OutputArray(std::vector<UMat>& vec) { init(STD_VECTOR_UMAT + ACCESS_WRITE, &vec); }
template<typename _Tp> inline
_OutputArray::_OutputArray(std::vector<_Tp>& vec)
{ init(FIXED_TYPE + STD_VECTOR + traits::Type<_Tp>::value + ACCESS_WRITE, &vec); }
#ifdef CV_CXX_STD_ARRAY
template<typename _Tp, std::size_t _Nm> inline
_OutputArray::_OutputArray(std::array<_Tp, _Nm>& arr)
{ init(FIXED_TYPE + FIXED_SIZE + STD_ARRAY + traits::Type<_Tp>::value + ACCESS_WRITE, arr.data(), Size(1, _Nm)); }
@@ -180,20 +203,11 @@ _OutputArray::_OutputArray(std::array<_Tp, _Nm>& arr)
template<std::size_t _Nm> inline
_OutputArray::_OutputArray(std::array<Mat, _Nm>& arr)
{ init(STD_ARRAY_MAT + ACCESS_WRITE, arr.data(), Size(1, _Nm)); }
#endif
inline
_OutputArray::_OutputArray(std::vector<bool>&)
{ CV_Error(Error::StsUnsupportedFormat, "std::vector<bool> cannot be an output array\n"); }
template<typename _Tp> inline
_OutputArray::_OutputArray(std::vector<std::vector<_Tp> >& vec)
{ init(FIXED_TYPE + STD_VECTOR_VECTOR + traits::Type<_Tp>::value + ACCESS_WRITE, &vec); }
inline
_OutputArray::_OutputArray(std::vector<std::vector<bool> >&)
{ CV_Error(Error::StsUnsupportedFormat, "std::vector<std::vector<bool> > cannot be an output array\n"); }
template<typename _Tp> inline
_OutputArray::_OutputArray(std::vector<Mat_<_Tp> >& vec)
{ init(FIXED_TYPE + STD_VECTOR_MAT + traits::Type<_Tp>::value + ACCESS_WRITE, &vec); }
@@ -214,7 +228,6 @@ template<typename _Tp> inline
_OutputArray::_OutputArray(const std::vector<_Tp>& vec)
{ init(FIXED_TYPE + FIXED_SIZE + STD_VECTOR + traits::Type<_Tp>::value + ACCESS_WRITE, &vec); }
#ifdef CV_CXX_STD_ARRAY
template<typename _Tp, std::size_t _Nm> inline
_OutputArray::_OutputArray(const std::array<_Tp, _Nm>& arr)
{ init(FIXED_TYPE + FIXED_SIZE + STD_ARRAY + traits::Type<_Tp>::value + ACCESS_WRITE, arr.data(), Size(1, _Nm)); }
@@ -222,7 +235,6 @@ _OutputArray::_OutputArray(const std::array<_Tp, _Nm>& arr)
template<std::size_t _Nm> inline
_OutputArray::_OutputArray(const std::array<Mat, _Nm>& arr)
{ init(FIXED_SIZE + STD_ARRAY_MAT + ACCESS_WRITE, arr.data(), Size(1, _Nm)); }
#endif
template<typename _Tp> inline
_OutputArray::_OutputArray(const std::vector<std::vector<_Tp> >& vec)
@@ -278,10 +290,29 @@ inline _OutputArray::_OutputArray(const ogl::Buffer& buf)
inline _OutputArray::_OutputArray(const cuda::HostMem& cuda_mem)
{ init(FIXED_TYPE + FIXED_SIZE + CUDA_HOST_MEM + ACCESS_WRITE, &cuda_mem); }
template<typename _Tp> inline
_OutputArray _OutputArray::rawOut(std::vector<_Tp>& vec)
{
_OutputArray v;
v.flags = _InputArray::FIXED_TYPE + _InputArray::STD_VECTOR + rawType<_Tp>() + ACCESS_WRITE;
v.obj = (void*)&vec;
return v;
}
template<typename _Tp, std::size_t _Nm> inline
_OutputArray _OutputArray::rawOut(std::array<_Tp, _Nm>& arr)
{
_OutputArray v;
v.flags = FIXED_TYPE + FIXED_SIZE + STD_ARRAY + traits::Type<_Tp>::value + ACCESS_WRITE;
v.obj = (void*)arr.data();
v.sz = Size(1, _Nm);
return v;
}
///////////////////////////////////////////////////////////////////////////////////////////
inline _InputOutputArray::_InputOutputArray() { init(ACCESS_RW, 0); }
inline _InputOutputArray::_InputOutputArray(int _flags, void* _obj) { init(_flags|ACCESS_RW, _obj); }
inline _InputOutputArray::_InputOutputArray() { init(0+ACCESS_RW, 0); }
inline _InputOutputArray::_InputOutputArray(int _flags, void* _obj) { init(_flags+ACCESS_RW, _obj); }
inline _InputOutputArray::_InputOutputArray(Mat& m) { init(MAT+ACCESS_RW, &m); }
inline _InputOutputArray::_InputOutputArray(std::vector<Mat>& vec) { init(STD_VECTOR_MAT+ACCESS_RW, &vec); }
inline _InputOutputArray::_InputOutputArray(UMat& m) { init(UMAT+ACCESS_RW, &m); }
@@ -291,7 +322,6 @@ template<typename _Tp> inline
_InputOutputArray::_InputOutputArray(std::vector<_Tp>& vec)
{ init(FIXED_TYPE + STD_VECTOR + traits::Type<_Tp>::value + ACCESS_RW, &vec); }
#ifdef CV_CXX_STD_ARRAY
template<typename _Tp, std::size_t _Nm> inline
_InputOutputArray::_InputOutputArray(std::array<_Tp, _Nm>& arr)
{ init(FIXED_TYPE + FIXED_SIZE + STD_ARRAY + traits::Type<_Tp>::value + ACCESS_RW, arr.data(), Size(1, _Nm)); }
@@ -299,10 +329,6 @@ _InputOutputArray::_InputOutputArray(std::array<_Tp, _Nm>& arr)
template<std::size_t _Nm> inline
_InputOutputArray::_InputOutputArray(std::array<Mat, _Nm>& arr)
{ init(STD_ARRAY_MAT + ACCESS_RW, arr.data(), Size(1, _Nm)); }
#endif
inline _InputOutputArray::_InputOutputArray(std::vector<bool>&)
{ CV_Error(Error::StsUnsupportedFormat, "std::vector<bool> cannot be an input/output array\n"); }
template<typename _Tp> inline
_InputOutputArray::_InputOutputArray(std::vector<std::vector<_Tp> >& vec)
@@ -328,7 +354,6 @@ template<typename _Tp> inline
_InputOutputArray::_InputOutputArray(const std::vector<_Tp>& vec)
{ init(FIXED_TYPE + FIXED_SIZE + STD_VECTOR + traits::Type<_Tp>::value + ACCESS_RW, &vec); }
#ifdef CV_CXX_STD_ARRAY
template<typename _Tp, std::size_t _Nm> inline
_InputOutputArray::_InputOutputArray(const std::array<_Tp, _Nm>& arr)
{ init(FIXED_TYPE + FIXED_SIZE + STD_ARRAY + traits::Type<_Tp>::value + ACCESS_RW, arr.data(), Size(1, _Nm)); }
@@ -336,7 +361,6 @@ _InputOutputArray::_InputOutputArray(const std::array<_Tp, _Nm>& arr)
template<std::size_t _Nm> inline
_InputOutputArray::_InputOutputArray(const std::array<Mat, _Nm>& arr)
{ init(FIXED_SIZE + STD_ARRAY_MAT + ACCESS_RW, arr.data(), Size(1, _Nm)); }
#endif
template<typename _Tp> inline
_InputOutputArray::_InputOutputArray(const std::vector<std::vector<_Tp> >& vec)
@@ -394,6 +418,30 @@ inline _InputOutputArray::_InputOutputArray(const ogl::Buffer& buf)
inline _InputOutputArray::_InputOutputArray(const cuda::HostMem& cuda_mem)
{ init(FIXED_TYPE + FIXED_SIZE + CUDA_HOST_MEM + ACCESS_RW, &cuda_mem); }
template<typename _Tp> inline
_InputOutputArray _InputOutputArray::rawInOut(std::vector<_Tp>& vec)
{
_InputOutputArray v;
v.flags = _InputArray::FIXED_TYPE + _InputArray::STD_VECTOR + rawType<_Tp>() + ACCESS_RW;
v.obj = (void*)&vec;
return v;
}
template<typename _Tp, std::size_t _Nm> inline
_InputOutputArray _InputOutputArray::rawInOut(std::array<_Tp, _Nm>& arr)
{
_InputOutputArray v;
v.flags = FIXED_TYPE + FIXED_SIZE + STD_ARRAY + traits::Type<_Tp>::value + ACCESS_RW;
v.obj = (void*)arr.data();
v.sz = Size(1, _Nm);
return v;
}
template<typename _Tp> static inline _InputArray rawIn(_Tp& v) { return _InputArray::rawIn(v); }
template<typename _Tp> static inline _OutputArray rawOut(_Tp& v) { return _OutputArray::rawOut(v); }
template<typename _Tp> static inline _InputOutputArray rawInOut(_Tp& v) { return _InputOutputArray::rawInOut(v); }
CV__DEBUG_NS_END
//////////////////////////////////////////// Mat //////////////////////////////////////////
@@ -504,24 +552,20 @@ Mat::Mat(int _rows, int _cols, int _type, void* _data, size_t _step)
if( _step == AUTO_STEP )
{
_step = minstep;
flags |= CONTINUOUS_FLAG;
}
else
{
CV_DbgAssert( _step >= minstep );
if (_step % esz1 != 0)
{
CV_Error(Error::BadStep, "Step must be a multiple of esz1");
}
if (_step == minstep || rows == 1)
flags |= CONTINUOUS_FLAG;
}
step[0] = _step;
step[1] = esz;
datalimit = datastart + _step * rows;
dataend = datalimit - _step + minstep;
updateContinuityFlag();
}
inline
@@ -537,7 +581,6 @@ Mat::Mat(Size _sz, int _type, void* _data, size_t _step)
if( _step == AUTO_STEP )
{
_step = minstep;
flags |= CONTINUOUS_FLAG;
}
else
{
@@ -547,19 +590,17 @@ Mat::Mat(Size _sz, int _type, void* _data, size_t _step)
{
CV_Error(Error::BadStep, "Step must be a multiple of esz1");
}
if (_step == minstep || rows == 1)
flags |= CONTINUOUS_FLAG;
}
step[0] = _step;
step[1] = esz;
datalimit = datastart + _step*rows;
dataend = datalimit - _step + minstep;
updateContinuityFlag();
}
template<typename _Tp> inline
Mat::Mat(const std::vector<_Tp>& vec, bool copyData)
: flags(MAGIC_VAL | traits::Type<_Tp>::value | CV_MAT_CONT_FLAG), dims(2), rows((int)vec.size()),
: flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows((int)vec.size()),
cols(1), data(0), datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows), step(0)
{
if(vec.empty())
@@ -574,22 +615,29 @@ Mat::Mat(const std::vector<_Tp>& vec, bool copyData)
Mat((int)vec.size(), 1, traits::Type<_Tp>::value, (uchar*)&vec[0]).copyTo(*this);
}
#ifdef CV_CXX11
template<typename _Tp, typename> inline
Mat::Mat(const std::initializer_list<_Tp> list)
: flags(MAGIC_VAL | traits::Type<_Tp>::value | CV_MAT_CONT_FLAG), dims(2), rows((int)list.size()),
cols(1), data(0), datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows), step(0)
: Mat()
{
if(list.size() == 0)
return;
CV_Assert(list.size() != 0);
Mat((int)list.size(), 1, traits::Type<_Tp>::value, (uchar*)list.begin()).copyTo(*this);
}
#endif
#ifdef CV_CXX_STD_ARRAY
template<typename _Tp> inline
Mat::Mat(const std::initializer_list<int> sizes, const std::initializer_list<_Tp> list)
: Mat()
{
size_t size_total = 1;
for(auto s : sizes)
size_total *= s;
CV_Assert(list.size() != 0);
CV_Assert(size_total == list.size());
Mat((int)sizes.size(), (int*)sizes.begin(), traits::Type<_Tp>::value, (uchar*)list.begin()).copyTo(*this);
}
template<typename _Tp, std::size_t _Nm> inline
Mat::Mat(const std::array<_Tp, _Nm>& arr, bool copyData)
: flags(MAGIC_VAL | traits::Type<_Tp>::value | CV_MAT_CONT_FLAG), dims(2), rows((int)arr.size()),
: flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows((int)arr.size()),
cols(1), data(0), datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows), step(0)
{
if(arr.empty())
@@ -603,11 +651,10 @@ Mat::Mat(const std::array<_Tp, _Nm>& arr, bool copyData)
else
Mat((int)arr.size(), 1, traits::Type<_Tp>::value, (uchar*)arr.data()).copyTo(*this);
}
#endif
template<typename _Tp, int n> inline
Mat::Mat(const Vec<_Tp, n>& vec, bool copyData)
: flags(MAGIC_VAL | traits::Type<_Tp>::value | CV_MAT_CONT_FLAG), dims(2), rows(n), cols(1), data(0),
: flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows(n), cols(1), data(0),
datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows), step(0)
{
if( !copyData )
@@ -623,7 +670,7 @@ Mat::Mat(const Vec<_Tp, n>& vec, bool copyData)
template<typename _Tp, int m, int n> inline
Mat::Mat(const Matx<_Tp,m,n>& M, bool copyData)
: flags(MAGIC_VAL | traits::Type<_Tp>::value | CV_MAT_CONT_FLAG), dims(2), rows(m), cols(n), data(0),
: flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows(m), cols(n), data(0),
datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows), step(0)
{
if( !copyData )
@@ -639,7 +686,7 @@ Mat::Mat(const Matx<_Tp,m,n>& M, bool copyData)
template<typename _Tp> inline
Mat::Mat(const Point_<_Tp>& pt, bool copyData)
: flags(MAGIC_VAL | traits::Type<_Tp>::value | CV_MAT_CONT_FLAG), dims(2), rows(2), cols(1), data(0),
: flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows(2), cols(1), data(0),
datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows), step(0)
{
if( !copyData )
@@ -658,7 +705,7 @@ Mat::Mat(const Point_<_Tp>& pt, bool copyData)
template<typename _Tp> inline
Mat::Mat(const Point3_<_Tp>& pt, bool copyData)
: flags(MAGIC_VAL | traits::Type<_Tp>::value | CV_MAT_CONT_FLAG), dims(2), rows(3), cols(1), data(0),
: flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows(3), cols(1), data(0),
datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows), step(0)
{
if( !copyData )
@@ -678,7 +725,7 @@ Mat::Mat(const Point3_<_Tp>& pt, bool copyData)
template<typename _Tp> inline
Mat::Mat(const MatCommaInitializer_<_Tp>& commaInitializer)
: flags(MAGIC_VAL | traits::Type<_Tp>::value | CV_MAT_CONT_FLAG), dims(0), rows(0), cols(0), data(0),
: flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(0), rows(0), cols(0), data(0),
datastart(0), dataend(0), allocator(0), u(0), size(&rows)
{
*this = commaInitializer.operator Mat_<_Tp>();
@@ -857,7 +904,9 @@ bool Mat::isSubmatrix() const
inline
size_t Mat::elemSize() const
{
return dims > 0 ? step.p[dims - 1] : 0;
size_t res = dims > 0 ? step.p[dims - 1] : 0;
CV_DbgAssert(res != 0);
return res;
}
inline
@@ -942,7 +991,7 @@ _Tp* Mat::ptr(int y)
template<typename _Tp> inline
const _Tp* Mat::ptr(int y) const
{
CV_DbgAssert( y == 0 || (data && dims >= 1 && data && (unsigned)y < (unsigned)size.p[0]) );
CV_DbgAssert( y == 0 || (data && dims >= 1 && (unsigned)y < (unsigned)size.p[0]) );
return (const _Tp*)(data + step.p[0] * y);
}
@@ -1253,7 +1302,6 @@ Mat::operator std::vector<_Tp>() const
return v;
}
#ifdef CV_CXX_STD_ARRAY
template<typename _Tp, std::size_t _Nm> inline
Mat::operator std::array<_Tp, _Nm>() const
{
@@ -1261,7 +1309,6 @@ Mat::operator std::array<_Tp, _Nm>() const
copyTo(v);
return v;
}
#endif
template<typename _Tp, int n> inline
Mat::operator Vec<_Tp, n>() const
@@ -1329,8 +1376,6 @@ void Mat::push_back(const std::vector<_Tp>& v)
push_back(Mat(v));
}
#ifdef CV_CXX_MOVE_SEMANTICS
inline
Mat::Mat(Mat&& m)
: flags(m.flags), dims(m.dims), rows(m.rows), cols(m.cols), data(m.data),
@@ -1392,8 +1437,6 @@ Mat& Mat::operator = (Mat&& m)
return *this;
}
#endif
///////////////////////////// MatSize ////////////////////////////
@@ -1401,22 +1444,36 @@ inline
MatSize::MatSize(int* _p)
: p(_p) {}
inline
int MatSize::dims() const
{
return (p - 1)[0];
}
inline
Size MatSize::operator()() const
{
CV_DbgAssert(p[-1] <= 2);
CV_DbgAssert(dims() <= 2);
return Size(p[1], p[0]);
}
inline
const int& MatSize::operator[](int i) const
{
CV_DbgAssert(i < dims());
#ifdef __OPENCV_BUILD
CV_DbgAssert(i >= 0);
#endif
return p[i];
}
inline
int& MatSize::operator[](int i)
{
CV_DbgAssert(i < dims());
#ifdef __OPENCV_BUILD
CV_DbgAssert(i >= 0);
#endif
return p[i];
}
@@ -1429,8 +1486,8 @@ MatSize::operator const int*() const
inline
bool MatSize::operator == (const MatSize& sz) const
{
int d = p[-1];
int dsz = sz.p[-1];
int d = dims();
int dsz = sz.dims();
if( d != dsz )
return false;
if( d == 2 )
@@ -1497,7 +1554,7 @@ template<typename _Tp> inline
Mat_<_Tp>::Mat_()
: Mat()
{
flags = (flags & ~CV_MAT_TYPE_MASK) | traits::Type<_Tp>::value;
flags = (flags & ~CV_MAT_TYPE_MASK) + traits::Type<_Tp>::value;
}
template<typename _Tp> inline
@@ -1554,7 +1611,7 @@ template<typename _Tp> inline
Mat_<_Tp>::Mat_(const Mat& m)
: Mat()
{
flags = (flags & ~CV_MAT_TYPE_MASK) | traits::Type<_Tp>::value;
flags = (flags & ~CV_MAT_TYPE_MASK) + traits::Type<_Tp>::value;
*this = m;
}
@@ -1624,19 +1681,20 @@ Mat_<_Tp>::Mat_(const std::vector<_Tp>& vec, bool copyData)
: Mat(vec, copyData)
{}
#ifdef CV_CXX11
template<typename _Tp> inline
Mat_<_Tp>::Mat_(std::initializer_list<_Tp> list)
: Mat(list)
{}
#endif
#ifdef CV_CXX_STD_ARRAY
template<typename _Tp> inline
Mat_<_Tp>::Mat_(const std::initializer_list<int> sizes, std::initializer_list<_Tp> list)
: Mat(sizes, list)
{}
template<typename _Tp> template<std::size_t _Nm> inline
Mat_<_Tp>::Mat_(const std::array<_Tp, _Nm>& arr, bool copyData)
: Mat(arr, copyData)
{}
#endif
template<typename _Tp> inline
Mat_<_Tp>& Mat_<_Tp>::operator = (const Mat& m)
@@ -1650,7 +1708,7 @@ Mat_<_Tp>& Mat_<_Tp>::operator = (const Mat& m)
{
return (*this = m.reshape(DataType<_Tp>::channels, m.dims, 0));
}
CV_DbgAssert(DataType<_Tp>::channels == m.channels());
CV_Assert(DataType<_Tp>::channels == m.channels() || m.empty());
m.convertTo(*this, type());
return *this;
}
@@ -1693,7 +1751,7 @@ void Mat_<_Tp>::release()
{
Mat::release();
#ifdef _DEBUG
flags = (flags & ~CV_MAT_TYPE_MASK) | traits::Type<_Tp>::value;
flags = (flags & ~CV_MAT_TYPE_MASK) + traits::Type<_Tp>::value;
#endif
}
@@ -1924,7 +1982,6 @@ Mat_<_Tp>::operator std::vector<_Tp>() const
return v;
}
#ifdef CV_CXX_STD_ARRAY
template<typename _Tp> template<std::size_t _Nm> inline
Mat_<_Tp>::operator std::array<_Tp, _Nm>() const
{
@@ -1932,7 +1989,6 @@ Mat_<_Tp>::operator std::array<_Tp, _Nm>() const
copyTo(a);
return a;
}
#endif
template<typename _Tp> template<int n> inline
Mat_<_Tp>::operator Vec<typename DataType<_Tp>::channel_type, n>() const
@@ -1996,8 +2052,6 @@ void Mat_<_Tp>::forEach(const Functor& operation) const {
Mat::forEach<_Tp, Functor>(operation);
}
#ifdef CV_CXX_MOVE_SEMANTICS
template<typename _Tp> inline
Mat_<_Tp>::Mat_(Mat_&& m)
: Mat(m)
@@ -2015,7 +2069,7 @@ template<typename _Tp> inline
Mat_<_Tp>::Mat_(Mat&& m)
: Mat()
{
flags = (flags & ~CV_MAT_TYPE_MASK) | traits::Type<_Tp>::value;
flags = (flags & ~CV_MAT_TYPE_MASK) + traits::Type<_Tp>::value;
*this = m;
}
@@ -2041,11 +2095,10 @@ template<typename _Tp> inline
Mat_<_Tp>::Mat_(MatExpr&& e)
: Mat()
{
flags = (flags & ~CV_MAT_TYPE_MASK) | traits::Type<_Tp>::value;
flags = (flags & ~CV_MAT_TYPE_MASK) + traits::Type<_Tp>::value;
*this = Mat(e);
}
#endif
///////////////////////////// SparseMat /////////////////////////////
@@ -2378,7 +2431,7 @@ SparseMatConstIterator_<_Tp> SparseMat::end() const
template<typename _Tp> inline
SparseMat_<_Tp>::SparseMat_()
{
flags = MAGIC_VAL | traits::Type<_Tp>::value;
flags = MAGIC_VAL + traits::Type<_Tp>::value;
}
template<typename _Tp> inline
@@ -3601,7 +3654,7 @@ UMat::UMat(const UMat& m)
template<typename _Tp> inline
UMat::UMat(const std::vector<_Tp>& vec, bool copyData)
: flags(MAGIC_VAL | traits::Type<_Tp>::value | CV_MAT_CONT_FLAG), dims(2), rows((int)vec.size()),
: flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows((int)vec.size()),
cols(1), allocator(0), usageFlags(USAGE_DEFAULT), u(0), offset(0), size(&rows)
{
if(vec.empty())
@@ -3766,7 +3819,9 @@ bool UMat::isSubmatrix() const
inline
size_t UMat::elemSize() const
{
return dims > 0 ? step.p[dims - 1] : 0;
size_t res = dims > 0 ? step.p[dims - 1] : 0;
CV_DbgAssert(res != 0);
return res;
}
inline
@@ -3816,8 +3871,6 @@ size_t UMat::total() const
return p;
}
#ifdef CV_CXX_MOVE_SEMANTICS
inline
UMat::UMat(UMat&& m)
: flags(m.flags), dims(m.dims), rows(m.rows), cols(m.cols), allocator(m.allocator),
@@ -3878,8 +3931,6 @@ UMat& UMat::operator = (UMat&& m)
return *this;
}
#endif
inline bool UMatData::hostCopyObsolete() const { return (flags & HOST_COPY_OBSOLETE) != 0; }
inline bool UMatData::deviceCopyObsolete() const { return (flags & DEVICE_COPY_OBSOLETE) != 0; }
@@ -3911,9 +3962,6 @@ inline void UMatData::markDeviceCopyObsolete(bool flag)
flags &= ~DEVICE_COPY_OBSOLETE;
}
inline UMatDataAutoLock::UMatDataAutoLock(UMatData* _u) : u(_u) { u->lock(); }
inline UMatDataAutoLock::~UMatDataAutoLock() { u->unlock(); }
//! @endcond
} //cv

View File

@@ -53,9 +53,7 @@
#include "opencv2/core/traits.hpp"
#include "opencv2/core/saturate.hpp"
#ifdef CV_CXX11
#include <initializer_list>
#endif
namespace cv
{
@@ -66,13 +64,14 @@ namespace cv
////////////////////////////// Small Matrix ///////////////////////////
//! @cond IGNORED
struct CV_EXPORTS Matx_AddOp {};
struct CV_EXPORTS Matx_SubOp {};
struct CV_EXPORTS Matx_ScaleOp {};
struct CV_EXPORTS Matx_MulOp {};
struct CV_EXPORTS Matx_DivOp {};
struct CV_EXPORTS Matx_MatMulOp {};
struct CV_EXPORTS Matx_TOp {};
// FIXIT Remove this (especially CV_EXPORTS modifier)
struct CV_EXPORTS Matx_AddOp { Matx_AddOp() {} Matx_AddOp(const Matx_AddOp&) {} };
struct CV_EXPORTS Matx_SubOp { Matx_SubOp() {} Matx_SubOp(const Matx_SubOp&) {} };
struct CV_EXPORTS Matx_ScaleOp { Matx_ScaleOp() {} Matx_ScaleOp(const Matx_ScaleOp&) {} };
struct CV_EXPORTS Matx_MulOp { Matx_MulOp() {} Matx_MulOp(const Matx_MulOp&) {} };
struct CV_EXPORTS Matx_DivOp { Matx_DivOp() {} Matx_DivOp(const Matx_DivOp&) {} };
struct CV_EXPORTS Matx_MatMulOp { Matx_MatMulOp() {} Matx_MatMulOp(const Matx_MatMulOp&) {} };
struct CV_EXPORTS Matx_TOp { Matx_TOp() {} Matx_TOp(const Matx_TOp&) {} };
//! @endcond
/** @brief Template class for small matrices whose type and size are known at compilation time
@@ -92,7 +91,7 @@ Except of the plain constructor which takes a list of elements, Matx can be init
float values[] = { 1, 2, 3};
Matx31f m(values);
@endcode
In case if C++11 features are avaliable, std::initializer_list can be also used to initizlize Matx:
In case if C++11 features are available, std::initializer_list can be also used to initialize Matx:
@code{.cpp}
Matx31f m = { 1, 2, 3};
@endcode
@@ -118,7 +117,7 @@ public:
//! default constructor
Matx();
Matx(_Tp v0); //!< 1x1 matrix
explicit Matx(_Tp v0); //!< 1x1 matrix
Matx(_Tp v0, _Tp v1); //!< 1x2 or 2x1 matrix
Matx(_Tp v0, _Tp v1, _Tp v2); //!< 1x3 or 3x1 matrix
Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3); //!< 1x4, 2x2 or 4x1 matrix
@@ -141,9 +140,7 @@ public:
_Tp v12, _Tp v13, _Tp v14, _Tp v15); //!< 1x16, 4x4 or 16x1 matrix
explicit Matx(const _Tp* vals); //!< initialize from a plain array
#ifdef CV_CXX11
Matx(std::initializer_list<_Tp>); //!< initialize from an initializer list
#endif
static Matx all(_Tp alpha);
static Matx zeros();
@@ -166,7 +163,7 @@ public:
template<int m1, int n1> Matx<_Tp, m1, n1> reshape() const;
//! extract part of the matrix
template<int m1, int n1> Matx<_Tp, m1, n1> get_minor(int i, int j) const;
template<int m1, int n1> Matx<_Tp, m1, n1> get_minor(int base_row, int base_col) const;
//! extract the matrix row
Matx<_Tp, 1, n> row(int i) const;
@@ -194,8 +191,8 @@ public:
Matx<_Tp, m, n> div(const Matx<_Tp, m, n>& a) const;
//! element access
const _Tp& operator ()(int i, int j) const;
_Tp& operator ()(int i, int j);
const _Tp& operator ()(int row, int col) const;
_Tp& operator ()(int row, int col);
//! 1D element access
const _Tp& operator ()(int i) const;
@@ -361,9 +358,7 @@ public:
Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9, _Tp v10, _Tp v11, _Tp v12, _Tp v13); //!< 14-element vector constructor
explicit Vec(const _Tp* values);
#ifdef CV_CXX11
Vec(std::initializer_list<_Tp>);
#endif
Vec(const Vec<_Tp, cn>& v);
@@ -665,7 +660,6 @@ Matx<_Tp, m, n>::Matx(const _Tp* values)
for( int i = 0; i < channels; i++ ) val[i] = values[i];
}
#ifdef CV_CXX11
template<typename _Tp, int m, int n> inline
Matx<_Tp, m, n>::Matx(std::initializer_list<_Tp> list)
{
@@ -676,7 +670,6 @@ Matx<_Tp, m, n>::Matx(std::initializer_list<_Tp> list)
val[i++] = elem;
}
}
#endif
template<typename _Tp, int m, int n> inline
Matx<_Tp, m, n> Matx<_Tp, m, n>::all(_Tp alpha)
@@ -749,13 +742,13 @@ Matx<_Tp, m1, n1> Matx<_Tp, m, n>::reshape() const
template<typename _Tp, int m, int n>
template<int m1, int n1> inline
Matx<_Tp, m1, n1> Matx<_Tp, m, n>::get_minor(int i, int j) const
Matx<_Tp, m1, n1> Matx<_Tp, m, n>::get_minor(int base_row, int base_col) const
{
CV_DbgAssert(0 <= i && i+m1 <= m && 0 <= j && j+n1 <= n);
CV_DbgAssert(0 <= base_row && base_row+m1 <= m && 0 <= base_col && base_col+n1 <= n);
Matx<_Tp, m1, n1> s;
for( int di = 0; di < m1; di++ )
for( int dj = 0; dj < n1; dj++ )
s(di, dj) = (*this)(i+di, j+dj);
s(di, dj) = (*this)(base_row+di, base_col+dj);
return s;
}
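// A usage sketch of get_minor with the clarified parameter names:
cv::Matx33f M = cv::Matx33f::eye();
cv::Matx22f sub = M.get_minor<2, 2>(0, 0);   // base_row = 0, base_col = 0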
@@ -786,17 +779,17 @@ typename Matx<_Tp, m, n>::diag_type Matx<_Tp, m, n>::diag() const
}
template<typename _Tp, int m, int n> inline
const _Tp& Matx<_Tp, m, n>::operator()(int i, int j) const
const _Tp& Matx<_Tp, m, n>::operator()(int row_idx, int col_idx) const
{
CV_DbgAssert( (unsigned)i < (unsigned)m && (unsigned)j < (unsigned)n );
return this->val[i*n + j];
CV_DbgAssert( (unsigned)row_idx < (unsigned)m && (unsigned)col_idx < (unsigned)n );
return this->val[row_idx*n + col_idx];
}
template<typename _Tp, int m, int n> inline
_Tp& Matx<_Tp, m, n>::operator ()(int i, int j)
_Tp& Matx<_Tp, m, n>::operator ()(int row_idx, int col_idx)
{
CV_DbgAssert( (unsigned)i < (unsigned)m && (unsigned)j < (unsigned)n );
return val[i*n + j];
CV_DbgAssert( (unsigned)row_idx < (unsigned)m && (unsigned)col_idx < (unsigned)n );
return val[row_idx*n + col_idx];
}
template<typename _Tp, int m, int n> inline
@@ -1019,11 +1012,9 @@ template<typename _Tp, int cn> inline
Vec<_Tp, cn>::Vec(const _Tp* values)
: Matx<_Tp, cn, 1>(values) {}
#ifdef CV_CXX11
template<typename _Tp, int cn> inline
Vec<_Tp, cn>::Vec(std::initializer_list<_Tp> list)
: Matx<_Tp, cn, 1>(list) {}
#endif
template<typename _Tp, int cn> inline
Vec<_Tp, cn>::Vec(const Vec<_Tp, cn>& m)

Some files were not shown because too many files have changed in this diff.