diff --git a/SSH/README.md b/SSH/README.md
new file mode 100644
index 0000000..98f7973
--- /dev/null
+++ b/SSH/README.md
@@ -0,0 +1 @@
+SSH: Single Stage Headless face detector
diff --git a/SSH/__init__.py b/SSH/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/SSH/rcnn/__init__.py b/SSH/rcnn/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/SSH/rcnn/cython/.gitignore b/SSH/rcnn/cython/.gitignore
new file mode 100644
index 0000000..15a165d
--- /dev/null
+++ b/SSH/rcnn/cython/.gitignore
@@ -0,0 +1,3 @@
+*.c
+*.cpp
+*.so
diff --git a/SSH/rcnn/cython/__init__.py b/SSH/rcnn/cython/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/SSH/rcnn/cython/anchors.pyx b/SSH/rcnn/cython/anchors.pyx
new file mode 100755
index 0000000..7005199
--- /dev/null
+++ b/SSH/rcnn/cython/anchors.pyx
@@ -0,0 +1,35 @@
+cimport cython
+import numpy as np
+cimport numpy as np
+
+DTYPE = np.float32
+ctypedef np.float32_t DTYPE_t
+
+def anchors_cython(int height, int width, int stride, np.ndarray[DTYPE_t, ndim=2] base_anchors):
+    """
+    Parameters
+    ----------
+    height: height of plane (number of rows)
+    width:  width of plane (number of columns)
+    stride: stride of the plane w.r.t. the original image
+    base_anchors: (A, 4) a base set of anchors; columns 0/2 are shifted along x, 1/3 along y
+    Returns
+    -------
+    all_anchors: (height, width, A, 4) ndarray of anchors spreading over the plane
+    """
+    cdef unsigned int A = base_anchors.shape[0]
+    cdef np.ndarray[DTYPE_t, ndim=4] all_anchors = np.zeros((height, width, A, 4), dtype=DTYPE)
+    cdef unsigned int iw, ih
+    cdef unsigned int k
+    cdef unsigned int sh
+    cdef unsigned int sw
+    for ih in range(height):  # rows outermost: follows the C-order layout of all_anchors
+        sh = ih * stride
+        for iw in range(width):
+            sw = iw * stride
+            for k in range(A):
+                all_anchors[ih, iw, k, 0] = base_anchors[k, 0] + sw
+                all_anchors[ih, iw, k, 1] = base_anchors[k, 1] + sh
+                all_anchors[ih, iw, k, 2] = base_anchors[k, 2] + sw
+                all_anchors[ih, iw, k, 3] = base_anchors[k, 3] + sh
+    return all_anchors
\ No newline at end of file
diff --git a/SSH/rcnn/cython/bbox.pyx b/SSH/rcnn/cython/bbox.pyx
new file mode 100644
index 0000000..0c49e12
--- /dev/null
+++ b/SSH/rcnn/cython/bbox.pyx
@@ -0,0 +1,55 @@
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Sergey Karayev
+# --------------------------------------------------------
+
+cimport cython
+import numpy as np
+cimport numpy as np
+
+DTYPE = np.float64  # the np.float alias was removed in NumPy 1.24; float64 is the same dtype
+ctypedef np.float64_t DTYPE_t
+
+def bbox_overlaps_cython(
+        np.ndarray[DTYPE_t, ndim=2] boxes,
+        np.ndarray[DTYPE_t, ndim=2] query_boxes):
+    """Pairwise IoU of boxes; extents are computed pixel-inclusive as (col2 - col0 + 1) * (col3 - col1 + 1).
+    Parameters
+    ----------
+    boxes: (N, 4) ndarray of float
+    query_boxes: (K, 4) ndarray of float
+    Returns
+    -------
+    overlaps: (N, K) ndarray of overlap between boxes and query_boxes (0 where they do not intersect)
+    """
+    cdef unsigned int N = boxes.shape[0]
+    cdef unsigned int K = query_boxes.shape[0]
+    cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE)
+    cdef DTYPE_t iw, ih, box_area
+    cdef DTYPE_t ua
+    cdef unsigned int k, n
+    for k in range(K):
+        box_area = (  # area of query box k, computed once per k
+            (query_boxes[k, 2] - query_boxes[k, 0] + 1) *
+            (query_boxes[k, 3] - query_boxes[k, 1] + 1)
+        )
+        for n in range(N):
+            iw = (  # width of the intersection
+                min(boxes[n, 2], query_boxes[k, 2]) -
+                max(boxes[n, 0], query_boxes[k, 0]) + 1
+            )
+            if iw > 0:
+                ih = (  # height of the intersection
+                    min(boxes[n, 3], query_boxes[k, 3]) -
+                    max(boxes[n, 1], query_boxes[k, 1]) + 1
+                )
+                if ih > 0:
+                    ua = float(  # union area = area(box n) + area(query k) - intersection
+                        (boxes[n, 2] - boxes[n, 0] + 1) *
+                        (boxes[n, 3] - boxes[n, 1] + 1) +
+                        box_area - iw * ih
+                    )
+                    overlaps[n, k] = iw * ih / ua
+    return overlaps
diff --git a/SSH/rcnn/cython/cpu_nms.pyx b/SSH/rcnn/cython/cpu_nms.pyx
new file mode 100644
index 0000000..1d0bef3
--- /dev/null
+++ b/SSH/rcnn/cython/cpu_nms.pyx
@@ -0,0 +1,68 @@
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+
+import numpy as np
+cimport numpy as np
+
+cdef inline np.float32_t max(np.float32_t a, np.float32_t b):  # typed float32 max; shadows the builtin in this module
+    return a if a >= b else b
+
+cdef inline np.float32_t min(np.float32_t a, np.float32_t b):  # typed float32 min; shadows the builtin in this module
+    return a if a <= b else b
+
+def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh):
+    """Greedy NMS over float32 dets with rows [x1, y1, x2, y2, score]; returns kept row indices, highest score first."""
+    cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0]
+    cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]
+    cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2]
+    cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3]
+    cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4]
+
+    cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+    cdef np.ndarray[np.intp_t, ndim=1] order = scores.argsort()[::-1]
+
+    cdef int ndets = dets.shape[0]
+    cdef np.ndarray[np.intp_t, ndim=1] suppressed = np.zeros((ndets), dtype=np.intp)
+
+    # positions within the score-sorted order
+    cdef int _i, _j
+    # original row indices into dets
+    cdef int i, j
+    # temp variables for box i's (the box currently under consideration)
+    cdef np.float32_t ix1, iy1, ix2, iy2, iarea
+    # variables for computing overlap with box j (lower scoring box)
+    cdef np.float32_t xx1, yy1, xx2, yy2
+    cdef np.float32_t w, h
+    cdef np.float32_t inter, ovr
+
+    keep = []
+    for _i in range(ndets):
+        i = order[_i]
+        if suppressed[i] == 1:
+            continue
+        keep.append(i)
+        ix1 = x1[i]
+        iy1 = y1[i]
+        ix2 = x2[i]
+        iy2 = y2[i]
+        iarea = areas[i]
+        for _j in range(_i + 1, ndets):
+            j = order[_j]
+            if suppressed[j] == 1:
+                continue
+            xx1 = max(ix1, x1[j])
+            yy1 = max(iy1, y1[j])
+            xx2 = min(ix2, x2[j])
+            yy2 = min(iy2, y2[j])
+            w = max(0.0, xx2 - xx1 + 1)
+            h = max(0.0, yy2 - yy1 + 1)
+            inter = w * h
+            ovr = inter / (iarea + areas[j] - inter)
+            if ovr >= thresh:
+                suppressed[j] = 1
+
+    return keep
diff --git a/SSH/rcnn/cython/gpu_nms.hpp b/SSH/rcnn/cython/gpu_nms.hpp
new file mode 100644
index 0000000..68b6d42
--- /dev/null
+++ b/SSH/rcnn/cython/gpu_nms.hpp
@@ -0,0 +1,2 @@
+void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
+          int boxes_dim, float nms_overlap_thresh, int device_id);
diff --git a/SSH/rcnn/cython/gpu_nms.pyx b/SSH/rcnn/cython/gpu_nms.pyx
new file mode 100644
index 0000000..59d84af
--- /dev/null
+++ b/SSH/rcnn/cython/gpu_nms.pyx
@@ -0,0 +1,31 @@
+# --------------------------------------------------------
+# Faster R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+
+import numpy as np
+cimport numpy as np
+
+assert sizeof(int) == sizeof(np.int32_t)
+
+cdef extern from "gpu_nms.hpp":
+    void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int)
+
+def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh,
+            np.int32_t device_id=0):
+    """Run NMS on GPU `device_id` over float32 dets (score in column 4); returns the kept row indices of dets as a list."""
+    cdef int boxes_num = dets.shape[0]
+    cdef int boxes_dim = dets.shape[1]
+    cdef int num_out
+    cdef np.ndarray[np.int32_t, ndim=1] keep = np.zeros(boxes_num, dtype=np.int32)
+    cdef np.ndarray[np.float32_t, ndim=1] \
+        scores = dets[:, 4]
+    # argsort yields intp indices; np.int_t (C long) is narrower than that on some platforms
+    cdef np.ndarray[np.intp_t, ndim=1] order = scores.argsort()[::-1]
+    cdef np.ndarray[np.float32_t, ndim=2] \
+        sorted_dets = dets[order, :]
+    _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id)
+    keep = keep[:num_out]
+    return list(order[keep])
diff --git a/SSH/rcnn/cython/nms_kernel.cu b/SSH/rcnn/cython/nms_kernel.cu
new file mode 100644
index 0000000..038a590
--- /dev/null
+++ b/SSH/rcnn/cython/nms_kernel.cu
@@ -0,0 +1,144 @@
+// ------------------------------------------------------------------
+// Faster R-CNN
+// Copyright (c) 2015 Microsoft
+// Licensed under The MIT License [see fast-rcnn/LICENSE for details]
+// Written by Shaoqing Ren
+// ------------------------------------------------------------------
+
+#include "gpu_nms.hpp"
+#include <vector>
+#include <iostream>
+
+#define CUDA_CHECK(condition) \
+  /* do/while scopes `error` so repeated uses never redefine it; a failure is only printed */ \
+  do { \
+    cudaError_t error = condition; \
+    if (error != cudaSuccess) { \
+      std::cout << cudaGetErrorString(error) << std::endl; \
+    } \
+  } while (0)
+
+#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
+int const threadsPerBlock = sizeof(unsigned long long) * 8;
+
+__device__ inline float devIoU(float const * const a, float const * const b) {  // IoU of two boxes laid out as [x1, y1, x2, y2, ...]
+  float left = max(a[0], b[0]), right = min(a[2], b[2]);
+  float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
+  float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);  // clamped to 0 when disjoint
+  float interS = width * height;
+  float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);  // pixel-inclusive areas
+  float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
+  return interS / (Sa + Sb - interS);
+}
+
+__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
+                           const float *dev_boxes, unsigned long long *dev_mask) {  // writes one 64-bit word per (row box, column tile)
+  const int row_start = blockIdx.y;
+  const int col_start = blockIdx.x;
+
+  if (row_start > col_start) return;  // tiles below the diagonal are never read by the host loop in _nms
+
+  const int row_size =
+        min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
+  const int col_size =
+        min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
+
+  __shared__ float block_boxes[threadsPerBlock * 5];  // this tile's column boxes, 5 floats per box
+  if (threadIdx.x < col_size) {
+    block_boxes[threadIdx.x * 5 + 0] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
+    block_boxes[threadIdx.x * 5 + 1] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
+    block_boxes[threadIdx.x * 5 + 2] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
+    block_boxes[threadIdx.x * 5 + 3] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
+    block_boxes[threadIdx.x * 5 + 4] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
+  }
+  __syncthreads();
+
+  if (threadIdx.x < row_size) {
+    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
+    const float *cur_box = dev_boxes + cur_box_idx * 5;
+    int i = 0;
+    unsigned long long t = 0;
+    int start = 0;
+    if (row_start == col_start) {
+      start = threadIdx.x + 1;  // on the diagonal tile only compare against later boxes
+    }
+    for (i = start; i < col_size; i++) {
+      if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
+        t |= 1ULL << i;  // bit i: column box i overlaps this row box above the threshold
+      }
+    }
+    const int col_blocks = DIVUP(n_boxes, threadsPerBlock);
+    dev_mask[cur_box_idx * col_blocks + col_start] = t;
+  }
+}
+
+void _set_device(int device_id) {
+  int current_device;
+  CUDA_CHECK(cudaGetDevice(&current_device));
+  if (current_device == device_id) {
+    return;
+  }
+  // Only switch when the requested device differs from the current one, so
+  // an already-selected device is left untouched.
+  CUDA_CHECK(cudaSetDevice(device_id));
+}
+
+void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
+          int boxes_dim, float nms_overlap_thresh, int device_id) {
+  _set_device(device_id);
+  // boxes_host holds boxes_num rows; nms_kernel reads them with a stride of 5 floats.
+  float* boxes_dev = NULL;
+  unsigned long long* mask_dev = NULL;
+
+  const int col_blocks = DIVUP(boxes_num, threadsPerBlock);
+
+  CUDA_CHECK(cudaMalloc(&boxes_dev,
+                        boxes_num * boxes_dim * sizeof(float)));
+  CUDA_CHECK(cudaMemcpy(boxes_dev,
+                        boxes_host,
+                        boxes_num * boxes_dim * sizeof(float),
+                        cudaMemcpyHostToDevice));
+
+  CUDA_CHECK(cudaMalloc(&mask_dev,
+                        boxes_num * col_blocks * sizeof(unsigned long long)));
+
+  dim3 blocks(DIVUP(boxes_num, threadsPerBlock),
+              DIVUP(boxes_num, threadsPerBlock));
+  dim3 threads(threadsPerBlock);
+  nms_kernel<<<blocks, threads>>>(boxes_num,
+                                  nms_overlap_thresh,
+                                  boxes_dev,
+                                  mask_dev);
+  CUDA_CHECK(cudaGetLastError());  // report kernel launch failures, which the launch itself never returns
+  std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
+  CUDA_CHECK(cudaMemcpy(&mask_host[0],
+                        mask_dev,
+                        sizeof(unsigned long long) * boxes_num * col_blocks,
+                        cudaMemcpyDeviceToHost));
+
+  std::vector<unsigned long long> remv(col_blocks);
+  // remv starts all-zero: std::vector value-initializes its elements, so no memset is needed.
+
+  int num_to_keep = 0;
+  for (int i = 0; i < boxes_num; i++) {
+    int nblock = i / threadsPerBlock;
+    int inblock = i % threadsPerBlock;
+
+    if (!(remv[nblock] & (1ULL << inblock))) {
+      keep_out[num_to_keep++] = i;
+      unsigned long long *p = &mask_host[0] + i * col_blocks;
+      for (int j = nblock; j < col_blocks; j++) {  // only tiles at or right of the diagonal are consulted
+        remv[j] |= p[j];
+      }
+    }
+  }
+  *num_out = num_to_keep;
+
+  CUDA_CHECK(cudaFree(boxes_dev));
+  CUDA_CHECK(cudaFree(mask_dev));
+}
diff --git a/SSH/rcnn/cython/setup.py b/SSH/rcnn/cython/setup.py
new file mode 100644
index 0000000..3e27add
--- /dev/null
+++ b/SSH/rcnn/cython/setup.py
@@ -0,0 +1,169 @@
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+
+import os
+from os.path import join as pjoin
+from setuptools import setup
+from distutils.extension import Extension
+from Cython.Distutils import build_ext
+import numpy as np
+
+
+def find_in_path(name, path):
+    "Return the absolute path of the first `name` found in `path`, else None"
+    # `path` is an os.pathsep-separated list of directories, searched in order.
+    # Recipe: http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/
+    candidates = (pjoin(directory, name) for directory in path.split(os.pathsep))
+    for candidate in candidates:
+        if os.path.exists(candidate):
+            return os.path.abspath(candidate)
+    return None
+
+
+def locate_cuda():
+    """Locate the CUDA environment on the system
+
+    Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64'
+    and values giving the path to each file or directory.
+    Raises EnvironmentError if nvcc or any of these paths cannot be found.
+    Starts by looking for the CUDAHOME env variable. If not found, everything
+    is based on finding 'nvcc' in the PATH.
+    """
+
+    # first check if the CUDAHOME env variable is in use
+    if 'CUDAHOME' in os.environ:
+        home = os.environ['CUDAHOME']
+        nvcc = pjoin(home, 'bin', 'nvcc')
+    else:
+        # otherwise, search the PATH for NVCC (an unset PATH must not raise KeyError)
+        default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin')
+        nvcc = find_in_path('nvcc', os.environ.get('PATH', default_path) + os.pathsep + default_path)
+        if nvcc is None:
+            raise EnvironmentError('The nvcc binary could not be '
+                'located in your $PATH. Either add it to your path, or set $CUDAHOME')
+        home = os.path.dirname(os.path.dirname(nvcc))
+
+    cudaconfig = {'home':home, 'nvcc':nvcc,
+                  'include': pjoin(home, 'include'),
+                  'lib64': pjoin(home, 'lib64')}
+    for k, v in cudaconfig.items():
+        if not os.path.exists(v):
+            raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v))
+
+    return cudaconfig
+
+
+# Test if cuda could be found
+try:
+    CUDA = locate_cuda()
+except EnvironmentError:
+    CUDA = None
+
+
+# Obtain the numpy include directory.  This logic works across numpy versions.
+try:
+    numpy_include = np.get_include()
+except AttributeError:
+    numpy_include = np.get_numpy_include()
+
+
+def customize_compiler_for_nvcc(self):
+    """Patch a distutils compiler instance so that .cu sources go to nvcc.
+
+    Subclassing UnixCCompiler is awkward: the subclass would still have to be
+    injected and receive the usual customizations (i.e.
+    distutils.sysconfig.customize_compiler). So the given compiler object is
+    monkey-patched instead: its _compile method is wrapped to choose the
+    compiler executable and flag set from each source file's extension.
+    Here `self` is that compiler object."""
+
+    # let the compiler accept .cu sources
+    self.src_extensions.append('.cu')
+
+    # remember the stock compiler command and the stock _compile method
+    stock_compiler_so = self.compiler_so
+    stock_compile = self._compile
+
+    # Replacement for _compile, run once per object file. distutils cannot
+    # switch compilers by source extension on its own, so the switch is
+    # done here.
+    def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts):
+        # the final extension of the source decides which compiler runs
+        is_cuda = os.path.splitext(src)[1] == '.cu'
+        if is_cuda:
+            # hand .cu files to nvcc
+            self.set_executable('compiler_so', CUDA['nvcc'])
+        # extra_postargs is the extra_compile_args dict of the Extension;
+        # only the flag list of the selected compiler is passed on
+        selected_args = extra_postargs['nvcc' if is_cuda else 'gcc']
+
+        stock_compile(obj, src, ext, cc_args, selected_args, pp_opts)
+        # restore the stock command, which may have been replaced by nvcc
+        self.compiler_so = stock_compiler_so
+
+    # install the wrapper on the compiler instance
+    self._compile = _compile
+
+
+# build_ext command that patches the compiler for nvcc before building the extensions
+class custom_build_ext(build_ext):
+    def build_extensions(self):
+        customize_compiler_for_nvcc(self.compiler)
+        build_ext.build_extensions(self)
+
+
+ext_modules = [
+    Extension(
+        "bbox",
+        ["bbox.pyx"],
+        extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]},
+        include_dirs=[numpy_include]
+    ),
+    Extension(
+        "anchors",
+        ["anchors.pyx"],
+        extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]},
+        include_dirs=[numpy_include]
+    ),
+    Extension(
+        "cpu_nms",
+        ["cpu_nms.pyx"],
+        extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]},
+        include_dirs = [numpy_include]
+    ),
+]
+
+if CUDA is not None:
+    ext_modules.append(
+        Extension('gpu_nms',
+            ['nms_kernel.cu', 'gpu_nms.pyx'],
+            library_dirs=[CUDA['lib64']],
+            libraries=['cudart'],
+            language='c++',
+            runtime_library_dirs=[CUDA['lib64']],
+            # This dict form of extra_compile_args is specific to this build script:
+            # the 'nvcc' flags are used for .cu sources and the 'gcc' flags for everything
+            # else. The dispatch is implemented in customize_compiler_for_nvcc() above.
+            extra_compile_args={'gcc': ["-Wno-unused-function"],
+                                'nvcc': ['-arch=sm_35',
+                                         '--ptxas-options=-v',
+                                         '-c',
+                                         '--compiler-options',
+                                         "'-fPIC'"]},
+            include_dirs = [numpy_include, CUDA['include']]
+        )
+    )
+else:
+    print('Skipping GPU_NMS')
+
+
+setup(
+    name='frcnn_cython',
+    ext_modules=ext_modules,
+    # inject our custom trigger
+    cmdclass={'build_ext': custom_build_ext},
+)
diff --git a/SSH/rcnn/processing/__init__.py b/SSH/rcnn/processing/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/SSH/rcnn/processing/bbox_regression.py b/SSH/rcnn/processing/bbox_regression.py
new file mode 100644
index 0000000..4e6f949
--- /dev/null
+++ b/SSH/rcnn/processing/bbox_regression.py
@@ -0,0 +1,120 @@
+"""
+This file has functions about generating bounding box regression targets
+"""
+
+import numpy as np
+
+from ..logger import logger
+from .bbox_transform import bbox_overlaps, bbox_transform
+from rcnn.config import config
+
+
+def compute_bbox_regression_targets(rois, overlaps, labels):
+    """
+    given rois, overlaps, gt labels, compute bounding box regression targets
+    :param rois: roidb[i]['boxes'] k * 4
+    :param overlaps: roidb[i]['max_overlaps'] k * 1
+    :param labels: roidb[i]['max_classes'] k * 1
+    :return: targets[i][class, dx, dy, dw, dh] k * 5; all zeros when there is no ground-truth ROI
+    """
+    # Ensure ROIs are floats (np.float was removed in NumPy 1.24; float64 is the same dtype)
+    rois = rois.astype(np.float64, copy=False)
+    targets = np.zeros((rois.shape[0], 5), dtype=np.float32)
+
+    # Sanity check
+    if len(rois) != len(overlaps):
+        logger.warning('bbox regression: len(rois) != len(overlaps)')
+
+    # Indices of ground-truth ROIs
+    gt_inds = np.where(overlaps == 1)[0]
+    if len(gt_inds) == 0:
+        logger.warning('bbox regression: len(gt_inds) == 0')
+        return targets  # nothing to regress to; argmax over an empty axis would raise
+
+    # Indices of examples for which we try to make predictions
+    ex_inds = np.where(overlaps >= config.TRAIN.BBOX_REGRESSION_THRESH)[0]
+
+    # Get IoU overlap between each ex ROI and gt ROI
+    ex_gt_overlaps = bbox_overlaps(rois[ex_inds, :], rois[gt_inds, :])
+
+    # Each ex ROI's target is the gt ROI it has max overlap with
+    gt_assignment = ex_gt_overlaps.argmax(axis=1)
+    gt_rois = rois[gt_inds[gt_assignment], :]
+    ex_rois = rois[ex_inds, :]
+
+    targets[ex_inds, 0] = labels[ex_inds]
+    targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois)
+    return targets
+
+
+def add_bbox_regression_targets(roidb):
+    """
+    given roidb, add ['bbox_targets'] to every entry and normalize them per class in place
+    :param roidb: roidb to be processed. must have gone through imdb.prepare_roidb
+    :return: (means, stds) used for the normalization, each flattened with ravel()
+    """
+    logger.info('bbox regression: add bounding box regression targets')
+    assert len(roidb) > 0
+    assert 'max_classes' in roidb[0]
+
+    num_images = len(roidb)
+    num_classes = roidb[0]['gt_overlaps'].shape[1]
+    for im_i in range(num_images):
+        rois = roidb[im_i]['boxes']
+        max_overlaps = roidb[im_i]['max_overlaps']
+        max_classes = roidb[im_i]['max_classes']
+        roidb[im_i]['bbox_targets'] = compute_bbox_regression_targets(rois, max_overlaps, max_classes)
+
+    if config.TRAIN.BBOX_NORMALIZATION_PRECOMPUTED:
+        # use fixed / precomputed means and stds instead of empirical values
+        means = np.tile(np.array(config.TRAIN.BBOX_MEANS), (num_classes, 1))
+        stds = np.tile(np.array(config.TRAIN.BBOX_STDS), (num_classes, 1))
+    else:
+        # accumulate per-class counts, sums and squared sums over all images
+        class_counts = np.zeros((num_classes, 1)) + 1e-14  # epsilon keeps the divisions below finite for empty classes
+        sums = np.zeros((num_classes, 4))
+        squared_sums = np.zeros((num_classes, 4))
+        for im_i in range(num_images):
+            targets = roidb[im_i]['bbox_targets']
+            for cls in range(1, num_classes):  # class 0 is skipped (presumably background)
+                cls_indexes = np.where(targets[:, 0] == cls)[0]
+                if cls_indexes.size > 0:
+                    class_counts[cls] += cls_indexes.size
+                    sums[cls, :] += targets[cls_indexes, 1:].sum(axis=0)
+                    squared_sums[cls, :] += (targets[cls_indexes, 1:] ** 2).sum(axis=0)
+
+        means = sums / class_counts
+        # var(x) = E(x^2) - E(x)^2
+        stds = np.sqrt(squared_sums / class_counts - means ** 2)
+
+    # normalize the targets of every class except class 0, in place
+    for im_i in range(num_images):
+        targets = roidb[im_i]['bbox_targets']
+        for cls in range(1, num_classes):
+            cls_indexes = np.where(targets[:, 0] == cls)[0]
+            roidb[im_i]['bbox_targets'][cls_indexes, 1:] -= means[cls, :]
+            roidb[im_i]['bbox_targets'][cls_indexes, 1:] /= stds[cls, :]
+
+    return means.ravel(), stds.ravel()
+
+
+def expand_bbox_regression_targets(bbox_targets_data, num_classes):
+    """
+    expand from 5 to 4 * num_classes; only the right class has non-zero bbox regression targets
+    :param bbox_targets_data: [k * 5], column 0 is the class, columns 1: are the targets
+    :param num_classes: number of classes
+    :return: (bbox_targets, bbox_weights), both float32 [k * 4 num_classes];
+    bbox_weights are set only for foreground boxes (class > 0), in the 4 columns of their class
+    """
+    classes = bbox_targets_data[:, 0]
+    bbox_targets = np.zeros((classes.size, 4 * num_classes), dtype=np.float32)
+    bbox_weights = np.zeros(bbox_targets.shape, dtype=np.float32)
+    indexes = np.where(classes > 0)[0]
+    for index in indexes:
+        cls = classes[index]
+        start = int(4 * cls)  # first of the 4 columns that belong to this class
+        end = start + 4
+        bbox_targets[index, start:end] = bbox_targets_data[index, 1:]
+        bbox_weights[index, start:end] = config.TRAIN.BBOX_WEIGHTS
+    return bbox_targets, bbox_weights
+
diff --git a/SSH/rcnn/processing/bbox_transform.py b/SSH/rcnn/processing/bbox_transform.py
new file mode 100644
index 0000000..7a8667e
--- /dev/null
+++ b/SSH/rcnn/processing/bbox_transform.py
@@ -0,0 +1,162 @@
+import numpy as np
+from ..cython.bbox import bbox_overlaps_cython
+
+
+def bbox_overlaps(boxes, query_boxes):  # thin wrapper over the compiled Cython implementation
+    return bbox_overlaps_cython(boxes, query_boxes)
+
+
+def bbox_overlaps_py(boxes, query_boxes):
+    """
+    determine IoU overlaps between boxes and query_boxes (pure-Python reference version)
+    :param boxes: n * 4 bounding boxes
+    :param query_boxes: k * 4 bounding boxes
+    :return: overlaps: n * k overlaps, float64; 0 where the boxes do not intersect
+    """
+    n_ = boxes.shape[0]
+    k_ = query_boxes.shape[0]
+    overlaps = np.zeros((n_, k_), dtype=np.float64)  # np.float was removed in NumPy 1.24
+    for k in range(k_):
+        query_box_area = (query_boxes[k, 2] - query_boxes[k, 0] + 1) * (query_boxes[k, 3] - query_boxes[k, 1] + 1)
+        for n in range(n_):
+            iw = min(boxes[n, 2], query_boxes[k, 2]) - max(boxes[n, 0], query_boxes[k, 0]) + 1
+            if iw > 0:
+                ih = min(boxes[n, 3], query_boxes[k, 3]) - max(boxes[n, 1], query_boxes[k, 1]) + 1
+                if ih > 0:
+                    box_area = (boxes[n, 2] - boxes[n, 0] + 1) * (boxes[n, 3] - boxes[n, 1] + 1)
+                    all_area = float(box_area + query_box_area - iw * ih)
+                    overlaps[n, k] = iw * ih / all_area
+    return overlaps
+
+
+def clip_boxes(boxes, im_shape):
+    """
+    Clamp box coordinates to the image; the input array is modified in place.
+    :param boxes: [N, 4* num_classes]
+    :param im_shape: tuple of 2, im_shape[0] bounds y and im_shape[1] bounds x
+    :return: [N, 4* num_classes]
+    """
+    max_x = im_shape[1] - 1
+    max_y = im_shape[0] - 1
+
+    # columns cycle through x1, y1, x2, y2 for every class
+    for offset, upper in enumerate((max_x, max_y, max_x, max_y)):
+        coords = boxes[:, offset::4]
+        boxes[:, offset::4] = np.maximum(np.minimum(coords, upper), 0)
+    # every coordinate now lies in [0, upper]
+    return boxes
+
+
+def nonlinear_transform(ex_rois, gt_rois):
+    """
+    compute bounding box regression targets from ex_rois to gt_rois
+    :param ex_rois: [N, 4]
+    :param gt_rois: [N, 4]
+    :return: [N, 4] rows of (dx, dy, dw, dh)
+    """
+    assert ex_rois.shape[0] == gt_rois.shape[0], 'inconsistent rois number'
+
+    ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0
+    ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0
+    ex_ctr_x = ex_rois[:, 0] + 0.5 * (ex_widths - 1.0)
+    ex_ctr_y = ex_rois[:, 1] + 0.5 * (ex_heights - 1.0)
+
+    gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0
+    gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0
+    gt_ctr_x = gt_rois[:, 0] + 0.5 * (gt_widths - 1.0)
+    gt_ctr_y = gt_rois[:, 1] + 0.5 * (gt_heights - 1.0)
+
+    targets_dx = (gt_ctr_x - ex_ctr_x) / (ex_widths + 1e-14)  # center shift relative to the ex box size
+    targets_dy = (gt_ctr_y - ex_ctr_y) / (ex_heights + 1e-14)
+    targets_dw = np.log(gt_widths / ex_widths)  # log of the size ratio
+    targets_dh = np.log(gt_heights / ex_heights)
+
+    targets = np.vstack(
+        (targets_dx, targets_dy, targets_dw, targets_dh)).transpose()
+    return targets
+
+
+def nonlinear_pred(boxes, box_deltas):
+    """
+    Transform the set of class-agnostic boxes into class-specific boxes
+    by applying the predicted offsets (box_deltas) as (dx, dy, dw, dh)
+    :param boxes: !important [N 4]
+    :param box_deltas: [N, 4 * num_classes]
+    :return: [N 4 * num_classes]
+    """
+    if boxes.shape[0] == 0:
+        return np.zeros((0, box_deltas.shape[1]))
+
+    boxes = boxes.astype(np.float64, copy=False)  # np.float was removed in NumPy 1.24
+    widths = boxes[:, 2] - boxes[:, 0] + 1.0
+    heights = boxes[:, 3] - boxes[:, 1] + 1.0
+    ctr_x = boxes[:, 0] + 0.5 * (widths - 1.0)
+    ctr_y = boxes[:, 1] + 0.5 * (heights - 1.0)
+
+    dx = box_deltas[:, 0::4]
+    dy = box_deltas[:, 1::4]
+    dw = box_deltas[:, 2::4]
+    dh = box_deltas[:, 3::4]
+
+    pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
+    pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
+    pred_w = np.exp(dw) * widths[:, np.newaxis]
+    pred_h = np.exp(dh) * heights[:, np.newaxis]
+
+    pred_boxes = np.zeros(box_deltas.shape)
+    # x1
+    pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * (pred_w - 1.0)
+    # y1
+    pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * (pred_h - 1.0)
+    # x2
+    pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * (pred_w - 1.0)
+    # y2
+    pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * (pred_h - 1.0)
+
+    return pred_boxes
+
+
+def iou_transform(ex_rois, gt_rois):
+    """ return gt_rois unchanged as the bbox targets: the IoU loss uses the gt boxes directly """
+    assert ex_rois.shape[0] == gt_rois.shape[0], 'inconsistent rois number'
+    return gt_rois
+
+
+def iou_pred(boxes, box_deltas):
+    """
+    Transform the set of class-agnostic boxes into class-specific boxes
+    by adding the predicted corner offsets (dx1, dy1, dx2, dy2) to each box
+    :param boxes: !important [N 4]
+    :param box_deltas: [N, 4 * num_classes]
+    :return: [N 4 * num_classes]
+    """
+    if boxes.shape[0] == 0:
+        return np.zeros((0, box_deltas.shape[1]))
+
+    boxes = boxes.astype(np.float64, copy=False)  # np.float was removed in NumPy 1.24
+    x1 = boxes[:, 0]
+    y1 = boxes[:, 1]
+    x2 = boxes[:, 2]
+    y2 = boxes[:, 3]
+
+    dx1 = box_deltas[:, 0::4]
+    dy1 = box_deltas[:, 1::4]
+    dx2 = box_deltas[:, 2::4]
+    dy2 = box_deltas[:, 3::4]
+
+    pred_boxes = np.zeros(box_deltas.shape)
+    # x1
+    pred_boxes[:, 0::4] = dx1 + x1[:, np.newaxis]
+    # y1
+    pred_boxes[:, 1::4] = dy1 + y1[:, np.newaxis]
+    # x2
+    pred_boxes[:, 2::4] = dx2 + x2[:, np.newaxis]
+    # y2
+    pred_boxes[:, 3::4] = dy2 + y2[:, np.newaxis]
+
+    return pred_boxes
+
+
+# define bbox_transform and bbox_pred
+bbox_transform = nonlinear_transform
+bbox_pred = nonlinear_pred
diff --git a/SSH/rcnn/processing/generate_anchor.py b/SSH/rcnn/processing/generate_anchor.py
new file mode 100644
index 0000000..16bbf2f
--- /dev/null
+++ b/SSH/rcnn/processing/generate_anchor.py
@@ -0,0 +1,96 @@
+"""
+Generate base anchors on index 0
+"""
+from __future__ import print_function
+import sys
+#from builtins import range
+import numpy as np
+from ..cython.anchors import anchors_cython
+
+
+def anchors_plane(feat_h, feat_w, stride, base_anchor):
+    return anchors_cython(feat_h, feat_w, stride, base_anchor)  # base_anchor: float32 (A, 4) array; result: (feat_h, feat_w, A, 4)
+
+def generate_anchors(base_size=16, ratios=[0.5, 1, 2],
+                     scales=2 ** np.arange(3, 6)):
+    """
+    Build the anchor windows for every (aspect ratio, scale) pair around the
+    reference window (0, 0, base_size - 1, base_size - 1); one row per anchor.
+    """
+
+    reference = np.array([1, 1, base_size, base_size]) - 1
+    per_ratio = _ratio_enum(reference, ratios)
+    # expand each ratio-adjusted window over all scales, keeping ratio-major order
+    per_scale = [_scale_enum(window, scales) for window in per_ratio]
+    return np.vstack(per_scale)
+
+def generate_anchors_fpn(base_size=[64,32,16,8,4], ratios=[0.5, 1, 2],
+                     scales=8):
+    """
+    Return a list with one anchor array per entry of base_size. ratios and scales
+    (array-likes) are split evenly into len(base_size) groups, one group per level.
+    """
+    # np.asarray lets plain lists/scalars through; calling .reshape on the
+    # arguments directly only works when they already are ndarrays.
+    levels = len(base_size)
+    _ratios = np.asarray(ratios).reshape((levels, -1))
+    _scales = np.asarray(scales).reshape((levels, -1))
+    # reshape raises ValueError when a size is not a multiple of `levels`
+    # (the signature defaults are such a case, so pass matching arrays).
+    anchors = []
+    for bs, level_ratios, level_scales in zip(base_size, _ratios, _scales):
+        anchors.append(generate_anchors(bs, level_ratios, level_scales))
+
+    return anchors
+
+def _whctrs(anchor):
+    """
+    Convert an (x1, y1, x2, y2) window into (width, height, x center, y center).
+    """
+
+    x1, y1, x2, y2 = anchor[0], anchor[1], anchor[2], anchor[3]
+    # extents count both end coordinates, hence the +1 / -1
+    width, height = x2 - x1 + 1, y2 - y1 + 1
+    centre = (x1 + 0.5 * (width - 1), y1 + 0.5 * (height - 1))
+    return (width, height) + centre
+
+
+def _mkanchors(ws, hs, x_ctr, y_ctr):
+    """
+    Build one (x1, y1, x2, y2) window per (ws[i], hs[i]) pair, all centred on
+    (x_ctr, y_ctr); the result has one row per pair.
+    """
+
+    # half extents as column vectors, so that hstack below
+    # yields an (N, 4) array
+    half_w = 0.5 * (ws[:, np.newaxis] - 1)
+    half_h = 0.5 * (hs[:, np.newaxis] - 1)
+    left, top = x_ctr - half_w, y_ctr - half_h
+    right, bottom = x_ctr + half_w, y_ctr + half_h
+    return np.hstack((left, top, right, bottom))
+
+
+def _ratio_enum(anchor, ratios):
+    """
+    Produce one window per aspect ratio (height / width), each centred on anchor
+    and covering roughly the same area as anchor.
+    """
+
+    width, height, cx, cy = _whctrs(anchor)
+    area = width * height
+    # width that gives the requested ratio at (about) the original area
+    new_ws = np.round(np.sqrt(area / ratios))
+    new_hs = np.round(new_ws * ratios)
+    return _mkanchors(new_ws, new_hs, cx, cy)
+
+
+def _scale_enum(anchor, scales):
+    """
+    Produce one window per scale: anchor's width and height multiplied by the
+    scale, keeping anchor's centre.
+    """
+
+    width, height, cx, cy = _whctrs(anchor)
+    scaled_ws = width * scales
+    scaled_hs = height * scales
+    return _mkanchors(scaled_ws, scaled_hs, cx, cy)
diff --git a/SSH/rcnn/processing/nms.py b/SSH/rcnn/processing/nms.py
new file mode 100644
index 0000000..230139c
--- /dev/null
+++ b/SSH/rcnn/processing/nms.py
@@ -0,0 +1,64 @@
+import numpy as np
+from ..cython.cpu_nms import cpu_nms
+try:
+    from ..cython.gpu_nms import gpu_nms
+except ImportError:
+    gpu_nms = None
+
+
+def py_nms_wrapper(thresh):
+    """Return a one-argument callable that runs the pure-Python nms with thresh."""
+    # thresh is captured by the closure; dets is supplied per call
+    return lambda dets: nms(dets, thresh)
+
+
+def cpu_nms_wrapper(thresh):
+    """Return a one-argument callable that runs the compiled cpu_nms with thresh."""
+    # thresh is captured by the closure; dets is supplied per call
+    return lambda dets: cpu_nms(dets, thresh)
+
+
+def gpu_nms_wrapper(thresh, device_id):
+    """Return a one-argument NMS callable for GPU device_id, or the CPU one when gpu_nms could not be imported."""
+    if gpu_nms is None:
+        return cpu_nms_wrapper(thresh)
+    def _nms(dets):
+        return gpu_nms(dets, thresh, device_id)
+    return _nms
+
+
+def nms(dets, thresh):
+    """
+    Greedy non-maximum suppression: repeatedly keep the highest-scoring
+    remaining box and drop every other box whose IoU with it is > thresh.
+    :param dets: [[x1, y1, x2, y2 score]]
+    :param thresh: boxes with IoU <= thresh against a kept box survive
+    :return: indexes to keep
+    """
+    left, top, right, bottom, scores = (dets[:, c] for c in range(5))
+
+    # +1: extents count both end coordinates of a box
+    areas = (right - left + 1) * (bottom - top + 1)
+    order = scores.argsort()[::-1]
+
+    keep = []
+    while order.size > 0:
+        best, rest = order[0], order[1:]
+        keep.append(best)
+
+        # intersection rectangle of the kept box with each remaining box
+        ix1 = np.maximum(left[best], left[rest])
+        iy1 = np.maximum(top[best], top[rest])
+        ix2 = np.minimum(right[best], right[rest])
+        iy2 = np.minimum(bottom[best], bottom[rest])
+        iw = np.maximum(0.0, ix2 - ix1 + 1)
+        ih = np.maximum(0.0, iy2 - iy1 + 1)
+        inter = iw * ih
+
+        # IoU = intersection / union
+        iou = inter / (areas[best] + areas[rest] - inter)
+
+        # survivors, in unchanged score order, form the next round
+        order = rest[iou <= thresh]
+
+    return keep
diff --git a/SSH/ssh_detector.py b/SSH/ssh_detector.py
new file mode 100644
index 0000000..cd49426
--- /dev/null
+++ b/SSH/ssh_detector.py
@@ -0,0 +1,194 @@
+from __future__ import print_function
+import sys
+import cv2
+import mxnet as mx
+from mxnet import ndarray as nd
+import numpy as np
+import numpy.random as npr
+from distutils.util import strtobool
+
+from rcnn.processing.bbox_transform import nonlinear_pred, clip_boxes
+from rcnn.processing.generate_anchor import generate_anchors_fpn, anchors_plane
+from rcnn.processing.nms import gpu_nms_wrapper
+
+
+class SSHDetector:
+  def __init__(self, prefix, epoch, ctx_id=0):  # prefix, epoch: checkpoint passed to mx.model.load_checkpoint; ctx_id: GPU device index
+    self.ctx_id = ctx_id
+    self.ctx = mx.gpu(self.ctx_id)  # always a GPU context; no CPU path is provided here
+    self.fpn_keys = []
+    fpn_stride = []  # filled below but not read afterwards in this method
+    fpn_base_size = []
+    self._feat_stride_fpn = [32, 16, 8]  # strides, in the order detect() walks the network outputs
+
+    for s in self._feat_stride_fpn:
+        self.fpn_keys.append('stride%s'%s)
+        fpn_stride.append(int(s))
+        fpn_base_size.append(16)
+
+    self._scales = np.array([32,16,8,4,2,1])  # 6 values over 3 strides: generate_anchors_fpn reshapes them to 2 scales per stride
+    self._ratios = np.array([1.0]*len(self._feat_stride_fpn))  # a single aspect ratio (1.0) per stride
+    self._anchors_fpn = dict(zip(self.fpn_keys, generate_anchors_fpn(base_size=fpn_base_size, scales=self._scales, ratios=self._ratios)))
+    self._num_anchors = dict(zip(self.fpn_keys, [anchors.shape[0] for anchors in self._anchors_fpn.values()]))
+    self._rpn_pre_nms_top_n = 1000  # per-stride candidate cap applied in detect() before NMS
+    #self._rpn_post_nms_top_n = rpn_post_nms_top_n
+    #self.score_threshold = 0.05
+    self.nms_threshold = 0.3
+    self._bbox_pred = nonlinear_pred
+    sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch)
+    #all_layers = sym.get_internals()
+    #outs = []
+    #for stride in self._feat_stride_fpn:
+    #  _name = 'rpn_cls_score_stride%s_output' % stride
+    #  rpn_cls_score = all_layers[_name]
+    #  rpn_cls_score_reshape = mx.symbol.Reshape(data=rpn_cls_score,
+    #                                            shape=(0, 2, -1, 0),
+    #                                            name="rpn_cls_score_reshape_stride%d" % stride)
+
+    #  rpn_cls_prob = mx.symbol.SoftmaxActivation(data=rpn_cls_score_reshape,
+    #                                             mode="channel",
+    #                                             name="rpn_cls_prob_stride%d" % stride)
+    #  rpn_cls_prob_reshape = mx.symbol.Reshape(data=rpn_cls_prob,
+    #                                           shape=(0, 2 * num_anchors, -1, 0),
+    #                                           name='rpn_cls_prob_reshape_stride%d' % stride)
+    #  outs.append(rpn_cls_prob_reshape)
+    #  _name = 'rpn_bbox_pred_stride%s_output' % stride
+    #  rpn_bbox_pred = all_layers[_name]
+    #  outs.append(rpn_bbox_pred)
+    #sym = mx.sym.Group(outs)
+
+    self.model = mx.mod.Module(symbol=sym, context=self.ctx, label_names = None)
+    image_size = (640, 640)  # shape of the initial bind only; NOTE(review): detect() feeds other sizes without rebinding -- presumably Module reshapes on forward, confirm
+    self.model.bind(data_shapes=[('data', (1, 3, image_size[0], image_size[1]))], for_training=False)
+    self.model.set_params(arg_params, aux_params)
+    self.nms = gpu_nms_wrapper(self.nms_threshold, self.ctx_id)  # falls back to CPU NMS when the GPU extension is not importable
+    pass
+
+  def detect(self,img, threshold=0.05):  # -> float32 array, one [x1, y1, x2, y2, score] row per detection
+    image_size = (img.shape[0], img.shape[1])
+    #self.model.bind(data_shapes=[('data', (1, 3, image_size[0], image_size[1]))], for_training=False)
+    im_info = [image_size[0], image_size[1], 1.0]  # [height, width, scale]; scale is fixed at 1.0
+    data = nd.zeros( (1 ,3, image_size[0], image_size[1]) )
+    nimg = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # img is treated as BGR
+    nimg = np.transpose(nimg, (2,0,1))  # HWC -> CHW
+    nimg = nd.array(nimg)
+    data[0] = nimg
+    db = mx.io.DataBatch(data=(data,))
+    self.model.forward(db, is_train=False)
+    net_out = self.model.get_outputs()
+
+
+    pre_nms_topN = self._rpn_pre_nms_top_n
+    #post_nms_topN = self._rpn_post_nms_top_n
+    #min_size_dict = self._rpn_min_size_fpn
+
+    proposals_list = []
+    scores_list = []
+    idx = 0  # net_out is consumed as consecutive (scores, bbox_deltas) pairs, one pair per stride
+    for s in self._feat_stride_fpn:
+        _key = 'stride%s'%s
+        stride = int(s)
+        scores = net_out[idx].asnumpy()
+        #print(scores.shape)
+        idx+=1
+        #print('scores',stride, scores.shape, file=sys.stderr)
+        scores = scores[:, self._num_anchors['stride%s'%s]:, :, :]  # skip the first A channels (presumably the background scores -- confirm against the model)
+        bbox_deltas = net_out[idx].asnumpy()
+        idx+=1
+
+        #if DEBUG:
+        #    print 'im_size: ({}, {})'.format(im_info[0], im_info[1])
+        #    print 'scale: {}'.format(im_info[2])
+
+        _height, _width = int(im_info[0] / stride), int(im_info[1] / stride)  # only used by the debug print below
+        height, width = bbox_deltas.shape[2], bbox_deltas.shape[3]
+
+        A = self._num_anchors['stride%s'%s]
+        K = height * width
+
+        anchors = anchors_plane(height, width, stride, self._anchors_fpn['stride%s'%s].astype(np.float32))
+        print((height, width), (_height, _width), anchors.shape, bbox_deltas.shape, scores.shape, file=sys.stderr)  # NOTE(review): runs on every call and stride; looks like leftover debug output
+        anchors = anchors.reshape((K * A, 4))
+
+        #print('pre', bbox_deltas.shape, height, width)
+        bbox_deltas = self._clip_pad(bbox_deltas, (height, width))
+        #print('after', bbox_deltas.shape, height, width)
+        bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)).reshape((-1, 4))  # rows of 4 deltas, passed to _bbox_pred alongside the (K * A, 4) anchors
+
+        scores = self._clip_pad(scores, (height, width))
+        scores = scores.transpose((0, 2, 3, 1)).reshape((-1, 1))
+
+        #print(anchors.shape, bbox_deltas.shape, A, K, file=sys.stderr)
+        proposals = self._bbox_pred(anchors, bbox_deltas)
+        #proposals = anchors
+
+        proposals = clip_boxes(proposals, im_info[:2])
+
+        #keep = self._filter_boxes(proposals, min_size_dict['stride%s'%s] * im_info[2])
+        #proposals = proposals[keep, :]
+        #scores = scores[keep]
+        #print('333', proposals.shape)
+
+        scores_ravel = scores.ravel()
+        order = scores_ravel.argsort()[::-1]  # descending score
+        if pre_nms_topN > 0:
+            order = order[:pre_nms_topN]  # keep only the best pre_nms_topN candidates of this stride
+        proposals = proposals[order, :]
+        scores = scores[order]
+
+        proposals_list.append(proposals)
+        scores_list.append(scores)
+
+    proposals = np.vstack(proposals_list)
+    scores = np.vstack(scores_list)
+    scores_ravel = scores.ravel()
+    order = scores_ravel.argsort()[::-1]  # re-sort the merged candidates of all strides by score
+    #if config.TEST.SCORE_THRESH>0.0:
+    #  _count = np.sum(scores_ravel>config.TEST.SCORE_THRESH)
+    #  order = order[:_count]
+    #if pre_nms_topN > 0:
+    #    order = order[:pre_nms_topN]
+    proposals = proposals[order, :]
+    scores = scores[order]
+
+    det = np.hstack((proposals, scores)).astype(np.float32)
+
+    #if np.shape(det)[0] == 0:
+    #    print("Something wrong with the input image(resolution is too low?), generate fake proposals for it.")
+    #    proposals = np.array([[1.0, 1.0, 2.0, 2.0]]*post_nms_topN, dtype=np.float32)
+    #    scores = np.array([[0.9]]*post_nms_topN, dtype=np.float32)
+    #    det = np.array([[1.0, 1.0, 2.0, 2.0, 0.9]]*post_nms_topN, dtype=np.float32)
+
+    
+    if self.nms_threshold<1.0:
+      keep = self.nms(det)
+      det = det[keep, :]
+    if threshold>0.0:
+      keep = np.where(det[:, 4] >= threshold)[0]  # drop detections scoring below threshold
+      det = det[keep, :]
+    return det
+
+  @staticmethod
+  def _filter_boxes(boxes, min_size):
+      """ Return the row indexes of boxes whose width and height are both >= min_size """
+      widths = boxes[:, 2] - boxes[:, 0] + 1
+      heights = boxes[:, 3] - boxes[:, 1] + 1
+      big_enough = np.logical_and(widths >= min_size, heights >= min_size)
+      return np.nonzero(big_enough)[0]
+
+  @staticmethod
+  def _clip_pad(tensor, pad_shape):
+      """
+      Crop the two trailing axes of tensor to at most (h, w); unchanged if it already fits.
+      :param tensor: [n, c, H, W]
+      :param pad_shape: [h, w]
+      :return: [n, c, min(H, h), min(W, w)], a copy when cropping happened
+      """
+      full_h, full_w = tensor.shape[2:]
+      target_h, target_w = pad_shape
+
+      if target_h >= full_h and target_w >= full_w:
+        return tensor
+      # copy so the result does not alias the larger input buffer
+      return tensor[:, :, :target_h, :target_w].copy()
+
diff --git a/SSH/t2.jpg b/SSH/t2.jpg
new file mode 100644
index 0000000..dcca930
Binary files /dev/null and b/SSH/t2.jpg differ
diff --git a/SSH/test.py b/SSH/test.py
new file mode 100644
index 0000000..4c3c755
--- /dev/null
+++ b/SSH/test.py
@@ -0,0 +1,29 @@
+import cv2
+import sys
+import numpy as np
+import datetime
+#sys.path.append('.')
+from ssh_detector import SSHDetector
+
+long_max = 1200  # longest allowed image side in pixels; larger inputs are downscaled
+t = 2  # total detect() runs: t-1 warm-up runs plus one timed run
+f = 't2.jpg'  # default input; overridden by the first command-line argument
+if len(sys.argv)>1:
+  f = sys.argv[1]
+img = cv2.imread(f)
+if img is None:  # cv2.imread reports an unreadable file by returning None
+  sys.exit('cannot read image: %s' % f)
+print(img.shape)
+if img.shape[0]>long_max or img.shape[1]>long_max:
+  scale = float(long_max) / max(img.shape[0], img.shape[1])
+  img = cv2.resize(img, (0,0), fx=scale, fy=scale)
+  print('resize to', img.shape)
+detector = SSHDetector('./model/e2ef', 0)
+for _ in range(t-1): # warm-up; range (unlike the Python 2-only xrange) works on Python 2 and 3
+  faces = detector.detect(img)
+timea = datetime.datetime.now()
+faces = detector.detect(img)
+timeb = datetime.datetime.now()
+diff = timeb - timea
+print('detection uses', diff.total_seconds(), 'seconds')
+print('find', faces.shape[0], 'faces')