diff --git a/dmlc-core b/dmlc-core
index ac983092ee3b..3ffea8694adf 160000
--- a/dmlc-core
+++ b/dmlc-core
@@ -1 +1 @@
-Subproject commit ac983092ee3b339f76a2d7e7c3b846570218200d
+Subproject commit 3ffea8694adf9c0363f9abbf162dc0e4a45b22c5
diff --git a/doc/jvm/xgboost4j_spark_tutorial.rst b/doc/jvm/xgboost4j_spark_tutorial.rst
index fd106be7c3d1..8e0be9d0b2c3 100644
--- a/doc/jvm/xgboost4j_spark_tutorial.rst
+++ b/doc/jvm/xgboost4j_spark_tutorial.rst
@@ -205,6 +205,18 @@ Training with Evaluation Sets
 
 You can also monitor the performance of the model during training with multiple evaluation datasets. By specifying ``eval_sets`` or call ``setEvalSets`` over a XGBoostClassifier or XGBoostRegressor, you can pass in multiple evaluation datasets typed as a Map from String to DataFrame.
 
+Training with Custom Evaluation Metrics
+----------------
+With XGBoost4j (including XGBoost4J-Spark), users are able to implement their own custom evaluation metrics and synchronize the values in the distributed training setting. To implement a custom evaluation metric, users should implement the interface ``ml.dmlc.xgboost4j.java.IEvalElementWiseDistributed`` (for binary classification and regression), ``ml.dmlc.xgboost4j.java.IEvalMultiClassesDistributed`` (for multi classification) and ``ml.dmlc.xgboost4j.java.IEvalRankListDistributed`` (for ranking).
+
+* ``ml.dmlc.xgboost4j.java.IEvalElementWiseDistributed``: users are supposed to implement ``float evalRow(float label, float pred);`` which calculates the metric for a single sample given the prediction and label, as well as ``float getFinal(float errorSum, float weightSum);`` which performs the final transformation over the sum of error and weights of samples.
+
+* ``ml.dmlc.xgboost4j.java.IEvalMultiClassesDistributed``: the methods to be implemented by the users are similar to ``ml.dmlc.xgboost4j.java.IEvalElementWiseDistributed`` except that the single row metric calculating method is ``float evalRow(int label, float pred, int numClasses);``
+
+* ``ml.dmlc.xgboost4j.java.IEvalRankListDistributed``: users are to implement ``float evalMetric(float[] preds, int[] labels);`` which gives the predictions and labels for instances in the same group;
+
+By default, these interfaces do not support being used in single machine evaluation, users can change this by re-implement ``float eval(float[][] predicts, DMatrix dmat)`` method.
+
 Prediction
 ==========
 
diff --git a/include/xgboost/build_config.h b/include/xgboost/build_config.h
index 6d364a6ff081..195d92905162 100644
--- a/include/xgboost/build_config.h
+++ b/include/xgboost/build_config.h
@@ -7,6 +7,7 @@
 
 // These check are for Makefile.
 #if !defined(XGBOOST_MM_PREFETCH_PRESENT) && !defined(XGBOOST_BUILTIN_PREFETCH_PRESENT)
+
 /* default logic for software pre-fetching */
 #if (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))) || defined(__INTEL_COMPILER)
 // Enable _mm_prefetch for Intel compiler and MSVC+x86
diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h
index 3328aba88316..03e6d3ee19b6 100644
--- a/include/xgboost/c_api.h
+++ b/include/xgboost/c_api.h
@@ -17,6 +17,9 @@
 #include <stdint.h>
 #endif  // __cplusplus
 
+// XGBoost C API will include APIs in Rabit C API
+#include <rabit/c_api.h>
+
 #if defined(_MSC_VER) || defined(_WIN32)
 #define XGB_DLL XGB_EXTERN_C __declspec(dllexport)
 #else
@@ -565,4 +568,6 @@ XGB_DLL int XGBoosterLoadRabitCheckpoint(
  */
 XGB_DLL int XGBoosterSaveRabitCheckpoint(BoosterHandle handle);
 
+XGB_DLL void XGBoosterRegisterNewMetrics(BoosterHandle handle, const char* metrics_name);
+
 #endif  // XGBOOST_C_API_H_
diff --git a/include/xgboost/learner.h b/include/xgboost/learner.h
index 187d27a2d6ab..15b8123cc148 100644
--- a/include/xgboost/learner.h
+++ b/include/xgboost/learner.h
@@ -16,7 +16,7 @@
 #include <vector>
 #include "./base.h"
 #include "./gbm.h"
-#include "./metric.h"
+#include "xgboost/metric/metric.h"
 #include "./objective.h"
 
 namespace xgboost {
@@ -186,6 +186,9 @@ class Learner : public rabit::Serializable {
    */
   virtual const std::map<std::string, std::string>& GetConfigurationArguments() const = 0;
 
+  /*! \brief The evaluation metrics used to evaluate the model. */
+  std::vector<std::unique_ptr<Metric> > metrics_;
+
  protected:
   /*! \brief internal base score of the model */
   bst_float base_score_;
@@ -193,8 +196,6 @@ class Learner : public rabit::Serializable {
   std::unique_ptr<ObjFunction> obj_;
   /*! \brief The gradient booster used by the model*/
   std::unique_ptr<GradientBooster> gbm_;
-  /*! \brief The evaluation metrics used to evaluate the model. */
-  std::vector<std::unique_ptr<Metric> > metrics_;
 };
 
 // implementation of inline functions.
diff --git a/include/xgboost/metric/elementwise_metric.h b/include/xgboost/metric/elementwise_metric.h
new file mode 100644
index 000000000000..5f67e8db1794
--- /dev/null
+++ b/include/xgboost/metric/elementwise_metric.h
@@ -0,0 +1,194 @@
+/*
+ * Copyright 2015-2019 by Contributors
+ */
+
+#ifndef XGBOOST_METRIC_ELEMENTWISE_METRIC_H_
+#define XGBOOST_METRIC_ELEMENTWISE_METRIC_H_
+
+#include <xgboost/metric/metric.h>
+#include <xgboost/metric/metric_common.h>
+
+#include <functional>
+#include <utility>
+#include <string>
+#include <vector>
+
+#include "../../../src/common/common.h"
+
+#if defined(XGBOOST_USE_CUDA)
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/transform_reduce.h>
+#include <thrust/execution_policy.h>
+#include <thrust/functional.h>  // thrust::plus<>
+
+#include "../../../src/common/device_helpers.cuh"
+#endif  // XGBOOST_USE_CUDA
+
+/*!
+ * \brief base class of element-wise evaluation
+ * \tparam Derived the name of subclass
+ */
+namespace xgboost {
+namespace metric {
+
+template<typename EvalRow>
+class ElementWiseMetricsReduction {
+ public:
+  explicit ElementWiseMetricsReduction(EvalRow policy) :
+          policy_(std::move(policy)) {}
+
+  PackedReduceResult CpuReduceMetrics(
+          const HostDeviceVector <bst_float> &weights,
+          const HostDeviceVector <bst_float> &labels,
+          const HostDeviceVector <bst_float> &preds) const {
+    size_t ndata = labels.Size();
+
+    const auto &h_labels = labels.HostVector();
+    const auto &h_weights = weights.HostVector();
+    const auto &h_preds = preds.HostVector();
+
+    bst_float residue_sum = 0;
+    bst_float weights_sum = 0;
+
+#pragma omp parallel for reduction(+: residue_sum, weights_sum) schedule(static)
+    for (omp_ulong i = 0; i < ndata; ++i) {
+     const bst_float wt = h_weights.size() > 0 ? h_weights[i] : 1.0f;
+     residue_sum += policy_.EvalRow(h_labels[i], h_preds[i]) * wt;
+     weights_sum += wt;
+    }
+    PackedReduceResult res{residue_sum, weights_sum};
+    return res;
+  }
+
+#if defined(XGBOOST_USE_CUDA)
+
+  PackedReduceResult DeviceReduceMetrics(
+     GPUSet::GpuIdType device_id,
+     size_t device_index,
+     const HostDeviceVector<bst_float>& weights,
+     const HostDeviceVector<bst_float>& labels,
+     const HostDeviceVector<bst_float>& preds) {
+    size_t n_data = preds.DeviceSize(device_id);
+
+    thrust::counting_iterator<size_t> begin(0);
+    thrust::counting_iterator<size_t> end = begin + n_data;
+
+    auto s_label = labels.DeviceSpan(device_id);
+    auto s_preds = preds.DeviceSpan(device_id);
+    auto s_weights = weights.DeviceSpan(device_id);
+
+    bool const is_null_weight = weights.Size() == 0;
+
+    auto d_policy = policy_;
+
+    PackedReduceResult result = thrust::transform_reduce(
+    thrust::cuda::par(allocators_.at(device_index)),
+    begin, end,
+    [=] XGBOOST_DEVICE(size_t idx) {
+    bst_float weight = is_null_weight ? 1.0f : s_weights[idx];
+
+    bst_float residue = d_policy.EvalRow(s_label[idx], s_preds[idx]);
+    residue *= weight;
+    return PackedReduceResult{ residue, weight };
+    },
+    PackedReduceResult(),
+    thrust::plus<PackedReduceResult>());
+
+    return result;
+  }
+
+#endif  // XGBOOST_USE_CUDA
+
+  PackedReduceResult Reduce(
+      GPUSet devices,
+      const HostDeviceVector <bst_float> &weights,
+      const HostDeviceVector <bst_float> &labels,
+      const HostDeviceVector <bst_float> &preds) {
+    PackedReduceResult result;
+
+    if (devices.IsEmpty()) {
+     result = CpuReduceMetrics(weights, labels, preds);
+    }
+#if defined(XGBOOST_USE_CUDA)
+    else {  // NOLINT
+      if (allocators_.size() != devices.Size()) {
+       allocators_.clear();
+       allocators_.resize(devices.Size());
+      }
+      preds.Reshard(devices);
+      labels.Reshard(devices);
+      weights.Reshard(devices);
+      std::vector<PackedReduceResult> res_per_device(devices.Size());
+
+#pragma omp parallel for schedule(static, 1) if (devices.Size() > 1)
+      for (GPUSet::GpuIdType id = *devices.begin(); id < *devices.end(); ++id) {
+        dh::safe_cuda(cudaSetDevice(id));
+        size_t index = devices.Index(id);
+        res_per_device.at(index) = DeviceReduceMetrics(id, index, weights, labels, preds);
+      }
+
+      for (auto const& res : res_per_device) {
+        result += res;
+      }
+    }
+#endif  // defined(XGBOOST_USE_CUDA)
+    return result;
+  }
+
+ private:
+  EvalRow policy_;
+#if defined(XGBOOST_USE_CUDA)
+  std::vector<dh::CubMemory> allocators_;
+#endif  // defined(XGBOOST_USE_CUDA)
+};
+
+template<typename Policy>
+struct EvalEWiseBase : public Metric {
+  EvalEWiseBase() : policy_{}, reducer_{policy_} {}
+
+  explicit EvalEWiseBase(Policy &policy) : policy_{policy}, reducer_{policy_} {}
+
+  explicit EvalEWiseBase(char const *policy_param) :
+          policy_{policy_param}, reducer_{policy_} {}
+
+  void Configure(
+          const std::vector<std::pair<std::string, std::string>> &args) override {
+    param_.InitAllowUnknown(args);
+  }
+
+  bst_float Eval(const HostDeviceVector <bst_float> &preds,
+                 const MetaInfo &info,
+                 bool distributed) override {
+    CHECK_NE(info.labels_.Size(), 0U) << "label set cannot be empty";
+    CHECK_EQ(preds.Size(), info.labels_.Size())
+      << "label and prediction size not match, "
+      << "hint: use merror or mlogloss for multi-class classification";
+    const auto ndata = static_cast<omp_ulong>(info.labels_.Size());
+    // Dealing with ndata < n_gpus.
+    GPUSet devices = GPUSet::All(param_.gpu_id, param_.n_gpus, ndata);
+
+    auto result =
+            reducer_.Reduce(devices, info.weights_, info.labels_, preds);
+
+    double dat[2]{result.Residue(), result.Weights()};
+    if (distributed) {
+      rabit::Allreduce<rabit::op::Sum>(dat, 2);
+    }
+    return Policy::GetFinal(dat[0], dat[1]);
+  }
+
+  const char *Name() const override {
+    return policy_.Name();
+  }
+
+ private:
+  Policy policy_;
+
+  MetricParam param_;
+
+  ElementWiseMetricsReduction<Policy> reducer_;
+};
+
+}  // namespace metric
+}  // namespace xgboost
+#endif  // XGBOOST_METRIC_ELEMENTWISE_METRIC_H_
diff --git a/include/xgboost/metric.h b/include/xgboost/metric/metric.h
similarity index 91%
rename from include/xgboost/metric.h
rename to include/xgboost/metric/metric.h
index 56ecebfbf8df..79ef4d026f03 100644
--- a/include/xgboost/metric.h
+++ b/include/xgboost/metric/metric.h
@@ -1,21 +1,21 @@
 /*!
- * Copyright 2014 by Contributors
+ * Copyright 2014-2019 by Contributors
  * \file metric.h
  * \brief interface of evaluation metric function supported in xgboost.
- * \author Tianqi Chen, Kailong Chen
  */
-#ifndef XGBOOST_METRIC_H_
-#define XGBOOST_METRIC_H_
+#ifndef XGBOOST_METRIC_METRIC_H_
+#define XGBOOST_METRIC_METRIC_H_
 
 #include <dmlc/registry.h>
+#include <xgboost/data.h>
+#include <xgboost/base.h>
+
 #include <vector>
 #include <string>
 #include <functional>
 #include <utility>
 
-#include "./data.h"
-#include "./base.h"
-#include "../../src/common/host_device_vector.h"
+#include "../../../src/common/host_device_vector.h"
 
 namespace xgboost {
 /*!
@@ -93,4 +93,4 @@ struct MetricReg
   ::xgboost::MetricReg&  __make_ ## MetricReg ## _ ## UniqueId ## __ =  \
       ::dmlc::Registry< ::xgboost::MetricReg>::Get()->__REGISTER__(Name)
 }  // namespace xgboost
-#endif  // XGBOOST_METRIC_H_
+#endif  // XGBOOST_METRIC_METRIC_H_
diff --git a/src/metric/metric_common.h b/include/xgboost/metric/metric_common.h
similarity index 94%
rename from src/metric/metric_common.h
rename to include/xgboost/metric/metric_common.h
index 293d0a235926..756fb4d8941e 100644
--- a/src/metric/metric_common.h
+++ b/include/xgboost/metric/metric_common.h
@@ -1,12 +1,11 @@
 /*!
- * Copyright 2018-2019 by Contributors
- * \file metric_param.cc
+ * Copyright 2019 by Contributors
  */
 #ifndef XGBOOST_METRIC_METRIC_COMMON_H_
 #define XGBOOST_METRIC_METRIC_COMMON_H_
 
 #include <dmlc/parameter.h>
-#include "../common/common.h"
+#include "../../../src/common/common.h"
 
 namespace xgboost {
 namespace metric {
@@ -39,6 +38,7 @@ class PackedReduceResult {
     return PackedReduceResult{residue_sum_ + other.residue_sum_,
                               weights_sum_ + other.weights_sum_};
   }
+
   PackedReduceResult &operator+=(PackedReduceResult const &other) {
     this->residue_sum_ += other.residue_sum_;
     this->weights_sum_ += other.weights_sum_;
diff --git a/include/xgboost/metric/multiclass_metric.h b/include/xgboost/metric/multiclass_metric.h
new file mode 100644
index 000000000000..b847e56ff277
--- /dev/null
+++ b/include/xgboost/metric/multiclass_metric.h
@@ -0,0 +1,226 @@
+/*!
+ * Copyright 2019 by Contributors
+ */
+
+#ifndef XGBOOST_METRIC_MULTICLASS_METRIC_H_
+#define XGBOOST_METRIC_MULTICLASS_METRIC_H_
+
+#include <xgboost/metric/metric.h>
+#include <xgboost/metric/metric_common.h>
+
+#include <functional>
+#include <string>
+#include <vector>
+#include <utility>
+
+#if defined(XGBOOST_USE_CUDA)
+#include <thrust/execution_policy.h>  // thrust::cuda::par
+#include <thrust/functional.h>        // thrust::plus<>
+#include <thrust/transform_reduce.h>
+#include <thrust/iterator/counting_iterator.h>
+
+#include "../../../src/common/device_helpers.cuh"
+#endif  // XGBOOST_USE_CUDA
+
+namespace xgboost {
+namespace metric {
+
+template <typename EvalRowPolicy>
+class MultiClassMetricsReduction {
+  void CheckLabelError(int32_t label_error, size_t n_class) const {
+    CHECK(label_error >= 0 && label_error < static_cast<int32_t>(n_class))
+        << "MultiClassEvaluation: label must be in [0, num_class),"
+        << " num_class=" << n_class << " but found " << label_error << " in label";
+  }
+
+ public:
+  MultiClassMetricsReduction() = default;
+
+  PackedReduceResult CpuReduceMetrics(
+      const HostDeviceVector<bst_float>& weights,
+      const HostDeviceVector<bst_float>& labels,
+      const HostDeviceVector<bst_float>& preds,
+      const size_t n_class) const {
+    size_t ndata = labels.Size();
+
+    const auto& h_labels = labels.HostVector();
+    const auto& h_weights = weights.HostVector();
+    const auto& h_preds = preds.HostVector();
+
+    bst_float residue_sum = 0;
+    bst_float weights_sum = 0;
+    int label_error = 0;
+    bool const is_null_weight = weights.Size() == 0;
+
+#pragma omp parallel for reduction(+: residue_sum, weights_sum) schedule(static)
+    for (omp_ulong idx = 0; idx < ndata; ++idx) {
+        bst_float weight = is_null_weight ? 1.0f : h_weights[idx];
+        auto label = static_cast<int>(h_labels[idx]);
+        if (label >= 0 && label < static_cast<int>(n_class)) {
+            residue_sum += EvalRowPolicy::EvalRow(
+                    label, h_preds.data() + idx * n_class, n_class) * weight;
+            weights_sum += weight;
+        } else {
+            label_error = label;
+        }
+    }
+    CheckLabelError(label_error, n_class);
+    PackedReduceResult res { residue_sum, weights_sum };
+
+    return res;
+  }
+
+#if defined(XGBOOST_USE_CUDA)
+
+  PackedReduceResult DeviceReduceMetrics(
+      GPUSet::GpuIdType device_id,
+      size_t device_index,
+      const HostDeviceVector<bst_float>& weights,
+      const HostDeviceVector<bst_float>& labels,
+      const HostDeviceVector<bst_float>& preds,
+      const size_t n_class) {
+    size_t n_data = labels.DeviceSize(device_id);
+
+    thrust::counting_iterator<size_t> begin(0);
+    thrust::counting_iterator<size_t> end = begin + n_data;
+
+    auto s_labels = labels.DeviceSpan(device_id);
+    auto s_preds = preds.DeviceSpan(device_id);
+    auto s_weights = weights.DeviceSpan(device_id);
+
+    bool const is_null_weight = weights.Size() == 0;
+    auto s_label_error = label_error_.GetSpan<int32_t>(1);
+    s_label_error[0] = 0;
+
+    PackedReduceResult result = thrust::transform_reduce(
+        thrust::cuda::par(allocators_.at(device_index)),
+        begin, end,
+        [=] XGBOOST_DEVICE(size_t idx) {
+          bst_float weight = is_null_weight ? 1.0f : s_weights[idx];
+          bst_float residue = 0;
+          auto label = static_cast<int>(s_labels[idx]);
+          if (label >= 0 && label < static_cast<int32_t>(n_class)) {
+            residue = EvalRowPolicy::EvalRow(
+                label, &s_preds[idx * n_class], n_class) * weight;
+          } else {
+            s_label_error[0] = label;
+          }
+          return PackedReduceResult{ residue, weight };
+        },
+        PackedReduceResult(),
+        thrust::plus<PackedReduceResult>());
+    CheckLabelError(s_label_error[0], n_class);
+
+    return result;
+  }
+
+#endif  // XGBOOST_USE_CUDA
+
+    PackedReduceResult Reduce(
+        GPUSet devices,
+        size_t n_class,
+        const HostDeviceVector<bst_float>& weights,
+        const HostDeviceVector<bst_float>& labels,
+        const HostDeviceVector<bst_float>& preds) {
+      PackedReduceResult result;
+
+      if (devices.IsEmpty()) {
+          result = CpuReduceMetrics(weights, labels, preds, n_class);
+      }
+  #if defined(XGBOOST_USE_CUDA)
+      else {  // NOLINT
+        if (allocators_.size() != devices.Size()) {
+          allocators_.clear();
+          allocators_.resize(devices.Size());
+        }
+        preds.Reshard(GPUDistribution::Granular(devices, n_class));
+        labels.Reshard(devices);
+        weights.Reshard(devices);
+        std::vector<PackedReduceResult> res_per_device(devices.Size());
+
+  #pragma omp parallel for schedule(static, 1) if (devices.Size() > 1)
+        for (GPUSet::GpuIdType id = *devices.begin(); id < *devices.end(); ++id) {
+          dh::safe_cuda(cudaSetDevice(id));
+          size_t index = devices.Index(id);
+          res_per_device.at(index) =
+              DeviceReduceMetrics(id, index, weights, labels, preds, n_class);
+        }
+
+        for (auto const& res : res_per_device) {
+          result += res;
+        }
+      }
+  #endif  // defined(XGBOOST_USE_CUDA)
+        return result;
+    }
+
+ private:
+  #if defined(XGBOOST_USE_CUDA)
+    dh::PinnedMemory label_error_;
+    std::vector<dh::CubMemory> allocators_;
+  #endif  // defined(XGBOOST_USE_CUDA)
+};
+
+
+/*!
+* \brief base class of multi-class evaluation
+* \tparam Derived the name of subclass
+*/
+template<typename Derived>
+struct EvalMClassBase : public Metric {
+  void Configure(
+      const std::vector<std::pair<std::string, std::string> >& args) override {
+    param_.InitAllowUnknown(args);
+  }
+
+  bst_float Eval(const HostDeviceVector<bst_float> &preds,
+                 const MetaInfo &info,
+                 bool distributed) override {
+    CHECK_NE(info.labels_.Size(), 0U) << "label set cannot be empty";
+    CHECK(preds.Size() % info.labels_.Size() == 0)
+        << "label and prediction size not match";
+    const size_t nclass = preds.Size() / info.labels_.Size();
+    CHECK_GE(nclass, 1U)
+        << "mlogloss and merror are only used for multi-class classification,"
+        << " use logloss for binary classification";
+    const auto ndata = static_cast<bst_omp_uint>(info.labels_.Size());
+
+    GPUSet devices = GPUSet::All(param_.gpu_id, param_.n_gpus, ndata);
+    auto result = reducer_.Reduce(devices, nclass, info.weights_, info.labels_, preds);
+    double dat[2] { result.Residue(), result.Weights() };
+
+    if (distributed) {
+      rabit::Allreduce<rabit::op::Sum>(dat, 2);
+    }
+    return Derived::GetFinal(dat[0], dat[1]);
+  }
+  /*!
+   * \brief to be implemented by subclass,
+   *   get evaluation result from one row
+   * \param label label of current instance
+   * \param pred prediction value of current instance
+   * \param nclass number of class in the prediction
+   */
+  XGBOOST_DEVICE static bst_float EvalRow(int label,
+                                          const bst_float *pred,
+                                          size_t nclass);
+  /*!
+   * \brief to be overridden by subclass, final transformation
+   * \param esum the sum statistics returned by EvalRow
+   * \param wsum sum of weight
+   */
+  inline static bst_float GetFinal(bst_float esum, bst_float wsum) {
+    return esum / wsum;
+  }
+
+ private:
+  MultiClassMetricsReduction<Derived> reducer_;
+  MetricParam param_;
+  // used to store error message
+  const char *error_msg_;
+};
+
+}  // namespace metric
+}  // namespace xgboost
+
+#endif  // XGBOOST_METRIC_MULTICLASS_METRIC_H_
diff --git a/include/xgboost/metric/ranking_metric.h b/include/xgboost/metric/ranking_metric.h
new file mode 100644
index 000000000000..bc77cb264da6
--- /dev/null
+++ b/include/xgboost/metric/ranking_metric.h
@@ -0,0 +1,109 @@
+/*
+  Copyright (c) 2019 by Contributors
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
+#ifndef XGBOOST_METRIC_RANKING_METRIC_H_
+#define XGBOOST_METRIC_RANKING_METRIC_H_
+
+#include <xgboost/metric/metric.h>
+#include <xgboost/metric/metric_common.h>
+
+#include <limits>
+#include <string>
+
+namespace xgboost {
+namespace metric {
+/*! \brief Evaluate rank list */
+struct EvalRankList : public Metric {
+ public:
+  bst_float Eval(const HostDeviceVector<bst_float> &preds,
+                const MetaInfo &info,
+                bool distributed) override {
+    CHECK_EQ(preds.Size(), info.labels_.Size())
+      << "label size predict size not match";
+    // quick consistency when group is not available
+    std::vector<unsigned> tgptr(2, 0);
+    tgptr[1] = static_cast<unsigned>(preds.Size());
+    const std::vector<unsigned> &gptr = info.group_ptr_.size() == 0 ? tgptr : info.group_ptr_;
+    CHECK_NE(gptr.size(), 0U) << "must specify group when constructing rank file";
+    CHECK_EQ(gptr.back(), preds.Size())
+     << "EvalRanklist: group structure must match number of prediction";
+    const auto ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
+    // sum statistics
+    double sum_metric = 0.0f;
+    const auto &labels = info.labels_.HostVector();
+
+    const std::vector<bst_float> &h_preds = preds.HostVector();
+    #pragma omp parallel reduction(+:sum_metric)
+    {
+     // each thread takes a local rec
+     std::vector<std::pair<bst_float, unsigned> > rec;
+     #pragma omp for schedule(static)
+     for (bst_omp_uint k = 0; k < ngroup; ++k) {
+       rec.clear();
+       for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
+         rec.emplace_back(h_preds[j], static_cast<int>(labels[j]));
+       }
+       sum_metric += this->EvalMetric(rec);
+     }
+    }
+    if (distributed) {
+      bst_float dat[2];
+      dat[0] = static_cast<bst_float>(sum_metric);
+      dat[1] = static_cast<bst_float>(ngroup);
+      // approximately estimate the metric using mean
+      rabit::Allreduce<rabit::op::Sum>(dat, 2);
+      return dat[0] / dat[1];
+    } else {
+      return static_cast<bst_float>(sum_metric) / ngroup;
+    }
+  }
+
+  const char *Name() const override {
+    return name_.c_str();
+  }
+
+ protected:
+  explicit EvalRankList(const char *name, const char *param) {
+    using namespace std;  // NOLINT(*)
+    minus_ = false;
+    if (param != nullptr) {
+      std::ostringstream os;
+      os << name << '@' << param;
+      name_ = os.str();
+      if (sscanf(param, "%u[-]?", &topn_) != 1) {
+        topn_ = std::numeric_limits<unsigned>::max();
+      }
+      if (param[strlen(param) - 1] == '-') {
+        minus_ = true;
+      }
+    } else {
+      name_ = name;
+      topn_ = std::numeric_limits<unsigned>::max();
+    }
+  }
+
+  /*! \return evaluation metric, given the pair_sort record, (pred,label) */
+  virtual bst_float
+  EvalMetric(std::vector<std::pair<bst_float, unsigned> > &pair_sort) const = 0; // NOLINT(*)
+
+ protected:
+  unsigned topn_;
+  std::string name_;
+  bool minus_;
+};
+
+}  // namespace metric
+}  // namespace xgboost
+
+#endif  // XGBOOST_METRIC_RANKING_METRIC_H_
diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
index 65f0ef30fe2a..de8abaa0a90f 100644
--- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
@@ -18,13 +18,11 @@ package ml.dmlc.xgboost4j.scala.spark
 
 import java.io.File
 import java.nio.file.Files
-import java.util.Properties
 
-import scala.collection.mutable.ListBuffer
 import scala.collection.{AbstractIterator, mutable}
 import scala.util.Random
 
-import ml.dmlc.xgboost4j.java.{IRabitTracker, Rabit, XGBoostError, RabitTracker => PyRabitTracker}
+import ml.dmlc.xgboost4j.java.{IEvalElementWiseDistributed, IEvalMultiClassesDistributed, IEvalRankListDistributed, IEvaluation, IRabitTracker, Rabit, XGBoostError, RabitTracker => PyRabitTracker}
 import ml.dmlc.xgboost4j.scala.rabit.RabitTracker
 import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost, _}
 import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
@@ -139,7 +137,7 @@ object XGBoost extends Serializable {
       rabitEnv: java.util.Map[String, String],
       round: Int,
       obj: ObjectiveTrait,
-      eval: EvalTrait,
+      eval: IEvaluation,
       prevBooster: Booster): Iterator[(Booster, Map[String, Array[Float]])] = {
 
     // to workaround the empty partitions in training dataset,
@@ -173,7 +171,7 @@ object XGBoost extends Serializable {
     }
   }
 
-  private def overrideParamsAccordingToTaskCPUs(
+  private def overrideParams(
       params: Map[String, Any],
       sc: SparkContext): Map[String, Any] = {
     val coresPerTask = sc.getConf.getInt("spark.task.cpus", 1)
@@ -283,7 +281,7 @@ object XGBoost extends Serializable {
     val round = params("num_round").asInstanceOf[Int]
     val useExternalMemory = params("use_external_memory").asInstanceOf[Boolean]
     val obj = params.getOrElse("custom_obj", null).asInstanceOf[ObjectiveTrait]
-    val eval = params.getOrElse("custom_eval", null).asInstanceOf[EvalTrait]
+    val eval = params.getOrElse("custom_eval", null).asInstanceOf[IEvaluation]
     val missing = params.getOrElse("missing", Float.NaN).asInstanceOf[Float]
     validateSparkSslConf(sparkContext)
 
@@ -426,7 +424,7 @@ object XGBoost extends Serializable {
         checkpointRound: Int =>
           val tracker = startTracker(nWorkers, trackerConf)
           try {
-            val overriddenParams = overrideParamsAccordingToTaskCPUs(params, sc)
+            val overriddenParams = overrideParams(params, sc)
             val parallelismTracker = new SparkParallelismTracker(sc, timeoutRequestWorkers,
               nWorkers)
             val rabitEnv = tracker.getWorkerEnvs
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/CustomizedEvalSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/CustomizedEvalSuite.scala
new file mode 100644
index 000000000000..49205bcc49a6
--- /dev/null
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/CustomizedEvalSuite.scala
@@ -0,0 +1,72 @@
+/*
+ Copyright (c) 2014 by Contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+package ml.dmlc.xgboost4j.scala.spark
+
+import org.scalatest.FunSuite
+
+class CustomizedEvalSuite extends FunSuite with PerTest {
+
+  test("(regression) distributed training with customized evaluation metrics") {
+    val paramMap = List("eta" -> "1", "max_depth" -> "6",
+      "objective" -> "reg:squarederror", "num_round" -> 5, "num_workers" -> numWorkers,
+      "custom_eval" -> new DistributedEvalErrorElementWise).toMap
+    val trainingDF = buildDataFrame(Regression.train, numWorkers)
+    val trainingCount = trainingDF.count()
+    val xgbModel = new XGBoostRegressor(paramMap).fit(trainingDF)
+    // DistributedEvalError returns 1.0f in evalRow and sum of error in getFinal
+    xgbModel.summary.trainObjectiveHistory.foreach(metricsInRound =>
+      assert(metricsInRound === trainingCount))
+  }
+
+  test("(binary classification) distributed training with customized evaluation metrics") {
+    val paramMap = List("eta" -> "1", "max_depth" -> "6",
+      "objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> numWorkers,
+      "custom_eval" -> new DistributedEvalErrorElementWise).toMap
+    val trainingDF = buildDataFrame(Classification.train, numWorkers)
+    val trainingCount = trainingDF.count()
+    val xgbModel = new XGBoostClassifier(paramMap).fit(trainingDF)
+    // DistributedEvalError returns 1.0f in evalRow and sum of error in getFinal
+    xgbModel.summary.trainObjectiveHistory.foreach(metricsInRound =>
+      assert(metricsInRound === trainingCount))
+  }
+
+  test("(multi classes classification) distributed training with" +
+    " customized evaluation metrics") {
+    val paramMap = List("eta" -> "1", "max_depth" -> "6",
+      "objective" -> "multi:softmax", "num_class" -> 6,
+      "num_round" -> 5, "num_workers" -> numWorkers,
+      "custom_eval" -> new DistributedEvalErrorMultiClasses).toMap
+    val trainingDF = buildDataFrame(MultiClassification.train, numWorkers)
+    val trainingCount = trainingDF.count()
+    val xgbModel = new XGBoostClassifier(paramMap).fit(trainingDF)
+    // DistributedEvalError returns 1.0f + num_classes in evalRow and sum of error in getFinal
+    xgbModel.summary.trainObjectiveHistory.foreach(metricsInRound =>
+      assert(metricsInRound === trainingCount * (1 + 6)))
+  }
+
+  test("(ranking) distributed training with" +
+    " customized evaluation metrics") {
+    val paramMap = List("eta" -> "1", "max_depth" -> "6",
+      "objective" -> "rank:pairwise", "num_round" -> 5, "num_workers" -> numWorkers,
+      "custom_eval" -> new DistributedEvalErrorRankList).toMap
+    val trainingDF = buildDataFrame(Ranking.train, numWorkers)
+    val trainingCount = trainingDF.count()
+    val xgbModel = new XGBoostRegressor(paramMap).fit(trainingDF)
+    xgbModel.summary.trainObjectiveHistory.foreach(metricsInRound =>
+      assert(metricsInRound === 0.0f))
+  }
+}
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/DistributedEvalError.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/DistributedEvalError.scala
new file mode 100644
index 000000000000..55eddd787ff7
--- /dev/null
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/DistributedEvalError.scala
@@ -0,0 +1,66 @@
+/*
+ Copyright (c) 2014 by Contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+package ml.dmlc.xgboost4j.scala.spark
+
+import ml.dmlc.xgboost4j.java.{IEvalElementWiseDistributed, IEvalMultiClassesDistributed, IEvalRankListDistributed}
+
+class DistributedEvalErrorElementWise extends IEvalElementWiseDistributed {
+
+  /**
+   * calculate the metrics for a single row given its label and prediction
+   */
+  def evalRow(label: Float, pred: Float): Float = 1.0f
+
+  /**
+   * perform transformation with the sum of error and weights to get the final evaluation metrics
+   */
+  def getFinal(errorSum: Float, weightSum: Float): Float = errorSum
+
+  /**
+   * get metrics' name
+   */
+  override def getMetric: String = "distributed_error_element_wise"
+}
+
+class DistributedEvalErrorMultiClasses extends IEvalMultiClassesDistributed {
+
+  /**
+   * calculate the metrics for a single row given its label and prediction
+   */
+  override def evalRow(label: Int, pred: Float, numClasses: Int): Float = {
+    1.0f + numClasses
+  }
+
+  override def getFinal(errorSum: Float, weightSum: Float): Float = errorSum
+
+  /**
+   * get metrics' name
+   */
+  override def getMetric: String = "distributed_error_multi_classes"
+}
+
+class DistributedEvalErrorRankList extends IEvalRankListDistributed {
+
+  override def evalMetric(preds: Array[Float], labels: Array[Int]): Float = {
+    0.0f
+  }
+
+  /**
+   * get metrics' name
+   */
+  override def getMetric: String = "distributed_error_ranking"
+}
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala
index 1affe1474f2a..f39f9d9e7d5e 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala
@@ -37,7 +37,7 @@ import org.apache.spark.ml.feature.VectorAssembler
 
 class XGBoostGeneralSuite extends FunSuite with PerTest {
 
-  test("test Rabit allreduce to validate Scala-implemented Rabit tracker") {
+  ignore("test Rabit allreduce to validate Scala-implemented Rabit tracker") {
     val vectorLength = 100
     val rdd = sc.parallelize(
       (1 to numWorkers * vectorLength).toArray.map { _ => Random.nextFloat() }, numWorkers).cache()
@@ -85,8 +85,7 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
       List("eta" -> "1", "max_depth" -> "6",
         "objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> numWorkers,
         "custom_eval" -> null, "custom_obj" -> null, "use_external_memory" -> false,
-        "missing" -> Float.NaN).toMap,
-      hasGroup = false)
+        "missing" -> Float.NaN).toMap)
     assert(booster != null)
   }
 
diff --git a/jvm-packages/xgboost4j/pom.xml b/jvm-packages/xgboost4j/pom.xml
index 83372a88ca7a..d85264f10e0b 100644
--- a/jvm-packages/xgboost4j/pom.xml
+++ b/jvm-packages/xgboost4j/pom.xml
@@ -83,6 +83,15 @@
                   </nonFilteredFileExtensions>
               </configuration>
           </plugin>
+          <plugin>
+              <groupId>org.apache.maven.plugins</groupId>
+              <artifactId>maven-compiler-plugin</artifactId>
+              <version>3.8.0</version>
+              <configuration>
+                  <source>8</source>
+                  <target>8</target>
+              </configuration>
+          </plugin>
       </plugins>
     </build>
 </project>
diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java
index 0d89d6a230ac..e72a4a908bd2 100644
--- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java
+++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java
@@ -277,6 +277,10 @@ public String evalSet(DMatrix[] evalMatrixs, String[] evalNames, IEvaluation eva
     return evalInfo;
   }
 
+  public long getHandle() {
+    return handle;
+  }
+
   /**
    * Advanced predict function with all the options.
    *
diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/IEvalElementWiseDistributed.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/IEvalElementWiseDistributed.java
new file mode 100644
index 000000000000..b67c95979aa7
--- /dev/null
+++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/IEvalElementWiseDistributed.java
@@ -0,0 +1,19 @@
+package ml.dmlc.xgboost4j.java;
+
+public interface IEvalElementWiseDistributed extends IEvaluation {
+
+  /**
+   * calculate the metrics for a single row given its label and prediction
+   */
+  float evalRow(float label, float pred);
+
+  /**
+   * perform transformation with the sum of error and weights to get the final evaluation metrics
+   */
+  float getFinal(float errorSum, float weightSum);
+
+  @Override
+  default float eval(float[][] predicts, DMatrix dmat) {
+    throw new RuntimeException("IEvalElementWiseDistributed does not support eval method");
+  }
+}
diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/IEvalMultiClassesDistributed.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/IEvalMultiClassesDistributed.java
new file mode 100644
index 000000000000..3c531f8658a6
--- /dev/null
+++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/IEvalMultiClassesDistributed.java
@@ -0,0 +1,22 @@
+package ml.dmlc.xgboost4j.java;
+
+public interface IEvalMultiClassesDistributed extends IEvaluation {
+
+  /**
+   * calculate the metrics for a single row given its label and prediction
+   */
+  float evalRow(int label, float pred, int numClasses);
+
+  /**
+   * perform transformation with the sum of error and weights to get the final evaluation metrics
+   */
+  default float getFinal(float errorSum, float weightSum) {
+    return errorSum / weightSum;
+  }
+
+  @Override
+  default float eval(float[][] predicts, DMatrix dmat) {
+    throw new RuntimeException("IEvalMultiClassesDistributed does not support eval method");
+  }
+
+}
diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/IEvalRankListDistributed.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/IEvalRankListDistributed.java
new file mode 100644
index 000000000000..464d5f4ac811
--- /dev/null
+++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/IEvalRankListDistributed.java
@@ -0,0 +1,11 @@
+package ml.dmlc.xgboost4j.java;
+
+public interface IEvalRankListDistributed extends IEvaluation {
+
+  float evalMetric(float[] preds, int[] labels);
+
+  @Override
+  default float eval(float[][] predicts, DMatrix dmat) {
+    throw new RuntimeException("IEvalRankingDistributed does not support eval method");
+  }
+}
diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/IEvaluation.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/IEvaluation.java
index 7f8abece4dc0..00da19737bd8 100644
--- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/IEvaluation.java
+++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/IEvaluation.java
@@ -19,20 +19,19 @@
 
 /**
  * interface for customized evaluation
- *
- * @author hzx
  */
 public interface IEvaluation extends Serializable {
+
   /**
-   * get evaluate metric
-   *
-   * @return evalMetric
+   * get metrics' name
    */
   String getMetric();
 
   /**
    * evaluate with predicts and data
    *
+   * this method is used only for evaluation in single-host mode
+   *
    * @param predicts predictions as array
    * @param dmat     data matrix to evaluate
    * @return result of the metric
diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoost.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoost.java
index b6a173bd6bd1..9d5c4e212ff0 100644
--- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoost.java
+++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoost.java
@@ -24,11 +24,10 @@
 
 /**
  * trainer for xgboost
- *
- * @author hzx
  */
 public class XGBoost {
   private static final Log logger = LogFactory.getLog(XGBoost.class);
+  private static boolean initializedEval = false;
 
   /**
    * load model from modelPath
@@ -108,6 +107,41 @@ public static Booster train(
     return train(dtrain, params, round, watches, metrics, obj, eval, earlyStoppingRound, null);
   }
 
+  private static void registerNewCustomEvalForDistributed(
+          Booster booster, IEvaluation eval, String evalType) {
+    XGBoostJNI.XGBoosterAddNewMetrics(booster.getHandle(), eval.getMetric(), evalType, eval);
+  }
+
+  private static String performEvaluation(
+          Booster booster,
+          IEvaluation eval,
+          String[] evalNames,
+          DMatrix[] evalMats,
+          int iter,
+          float[] metricsOut) throws XGBoostError {
+    String evalInfo;
+    if (eval != null &&
+        !(eval instanceof IEvalElementWiseDistributed) &&
+        !(eval instanceof IEvalMultiClassesDistributed) &&
+        !(eval instanceof IEvalRankListDistributed)) {
+      evalInfo = booster.evalSet(evalMats, evalNames, eval, metricsOut);
+    } else {
+      if (eval != null) {
+        String evalType;
+        if (eval instanceof IEvalElementWiseDistributed) {
+          evalType = "regression/binary";
+        } else if (eval instanceof IEvalMultiClassesDistributed) {
+          evalType = "multi_classes";
+        } else {
+          evalType = "ranking";
+        }
+        registerNewCustomEvalForDistributed(booster, eval, evalType);
+      }
+      evalInfo = booster.evalSet(evalMats, evalNames, iter, metricsOut);
+    }
+    return evalInfo;
+  }
+
   /**
    * Train a booster given parameters.
    *
@@ -195,12 +229,8 @@ public static Booster train(
       //evaluation
       if (evalMats.length > 0) {
         float[] metricsOut = new float[evalMats.length];
-        String evalInfo;
-        if (eval != null) {
-          evalInfo = booster.evalSet(evalMats, evalNames, eval, metricsOut);
-        } else {
-          evalInfo = booster.evalSet(evalMats, evalNames, iter, metricsOut);
-        }
+        String evalInfo = performEvaluation(booster, eval, evalNames, evalMats, iter,
+                metricsOut);
         for (int i = 0; i < metricsOut.length; i++) {
           metrics[i][iter] = metricsOut[i];
         }
diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java
index e797d67aa3a2..3eb020814b09 100644
--- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java
+++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java
@@ -23,8 +23,6 @@
 /**
  * xgboost JNI functions
  * change 2015-7-6: *use a long[] (length=1) as container of handle to get the output DMatrix or Booster
- *
- * @author hzx
  */
 class XGBoostJNI {
   private static final Log logger = LogFactory.getLog(DMatrix.class);
@@ -100,6 +98,10 @@ public final static native int XGBoosterEvalOneIter(long handle, int iter, long[
   public final static native int XGBoosterPredict(long handle, long dmat, int option_mask,
                                                   int ntree_limit, float[][] predicts);
 
+  public final static native int XGBoosterAddNewMetrics(long handle, String metricsName,
+                                                        String evalType,
+                                                        IEvaluation eval);
+
   public final static native int XGBoosterLoadModel(long handle, String fname);
 
   public final static native int XGBoosterSaveModel(long handle, String fname);
diff --git a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/XGBoost.scala b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/XGBoost.scala
index 609d7b2cde8c..138c6e871e75 100644
--- a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/XGBoost.scala
+++ b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/XGBoost.scala
@@ -18,7 +18,7 @@ package ml.dmlc.xgboost4j.scala
 
 import java.io.InputStream
 
-import ml.dmlc.xgboost4j.java.{Booster => JBooster, XGBoost => JXGBoost, XGBoostError}
+import ml.dmlc.xgboost4j.java.{IEvaluation, XGBoostError, Booster => JBooster, XGBoost => JXGBoost}
 import scala.collection.JavaConverters._
 
 /**
@@ -52,7 +52,7 @@ object XGBoost {
       watches: Map[String, DMatrix] = Map(),
       metrics: Array[Array[Float]] = null,
       obj: ObjectiveTrait = null,
-      eval: EvalTrait = null,
+      eval: IEvaluation = null,
       earlyStoppingRound: Int = 0,
       booster: Booster = null): Booster = {
     val jWatches = watches.mapValues(_.jDMatrix).asJava
diff --git a/jvm-packages/xgboost4j/src/native/xgboost4j.cpp b/jvm-packages/xgboost4j/src/native/xgboost4j.cpp
index 2857938791a7..7124cc64bbe6 100644
--- a/jvm-packages/xgboost4j/src/native/xgboost4j.cpp
+++ b/jvm-packages/xgboost4j/src/native/xgboost4j.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2014 by Contributors
+  Copyright (c) 2019 by Contributors
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
@@ -17,10 +17,14 @@
 #include <xgboost/c_api.h>
 #include <xgboost/base.h>
 #include <xgboost/logging.h>
+#include <xgboost/metric/metric.h>
+#include <xgboost/metric/elementwise_metric.h>
+#include <xgboost/metric/multiclass_metric.h>
 #include "./xgboost4j.h"
 #include <cstring>
 #include <vector>
 #include <string>
+#include <xgboost/metric/ranking_metric.h>
 
 // helper functions
 // set handle
@@ -33,6 +37,8 @@ void setHandle(JNIEnv *jenv, jlongArray jhandle, void* handle) {
   jenv->SetLongArrayRegion(jhandle, 0, 1, &out);
 }
 
+typedef void *CustomEvalHandle;  // NOLINT(*)
+
 // global JVM
 static JavaVM* global_jvm = nullptr;
 
@@ -60,6 +66,7 @@ XGB_EXTERN_C int XGBoost4jCallbackDataIterNext(
                                           "hasNext", "()Z");
     jmethodID next = jenv->GetMethodID(iterClass,
                                        "next", "()Ljava/lang/Object;");
+
     int ret_value;
     if (jenv->CallBooleanMethod(jiter, hasNext)) {
       ret_value = 1;
@@ -148,6 +155,165 @@ XGB_EXTERN_C int XGBoost4jCallbackDataIterNext(
   }
 }
 
+class CustomEvalRanking : public xgboost::metric::EvalRankList {
+public:
+  explicit CustomEvalRanking(std::string name, CustomEvalHandle handle):
+  EvalRankList(name.data(), name.data()), metrics_name(name) {
+    JNIEnv* jenv;
+    int jni_status = global_jvm->GetEnv((void **) &jenv, JNI_VERSION_1_6);
+    if (jni_status == JNI_EDETACHED) {
+      global_jvm->AttachCurrentThread(reinterpret_cast<void **>(&jenv), nullptr);
+    } else {
+      CHECK(jni_status == JNI_OK);
+    }
+    std::lock_guard<std::mutex> guard(eval_handle_mutex);
+    if (custom_eval_handle == nullptr) {
+      custom_eval_handle = jenv->NewGlobalRef(static_cast<jobject>(handle));
+    }
+  }
+
+protected:
+
+  xgboost::bst_float EvalMetric(std::vector< std::pair<xgboost::bst_float, unsigned> > &rec)
+    const override {
+    JNIEnv* jenv;
+    global_jvm->AttachCurrentThread(reinterpret_cast<void **>(&jenv), nullptr);
+    jclass eval_interface = jenv->FindClass("ml/dmlc/xgboost4j/java/IEvalRankListDistributed");
+    jmethodID eval_metrics_func = jenv->GetMethodID(eval_interface, "evalMetric", "([F[I)F");
+    jfloatArray preds = jenv->NewFloatArray(rec.size());
+    jintArray labels = jenv->NewIntArray(rec.size());
+    std::vector<jfloat> fill_float;
+    std::vector<jint> fill_int;
+    for (int i = 0; i < rec.size(); i++) {
+      fill_float.push_back(rec[i].first);
+      fill_int.push_back(rec[i].second);
+    }
+    jfloat *f1 = &fill_float[0];
+    jint *f2 = &fill_int[0];
+    jenv->SetFloatArrayRegion(preds, 0, rec.size(), f1);
+    jenv->SetIntArrayRegion(labels, 0, rec.size(), f2);
+    return jenv->CallFloatMethod(custom_eval_handle, eval_metrics_func, preds, labels);
+  }
+
+private:
+  std::string metrics_name;
+
+  static jobject custom_eval_handle;
+  /*! \brief lock guarding the registering*/
+  static std::mutex eval_handle_mutex;
+};
+
+std::mutex CustomEvalRanking::eval_handle_mutex;
+jobject CustomEvalRanking::custom_eval_handle = nullptr;
+
+class CustomEvalMultiClasses : public xgboost::metric::EvalMClassBase<CustomEvalMultiClasses> {
+public:
+  CustomEvalMultiClasses(std::string name, CustomEvalHandle handle): metrics_name(name) {
+    JNIEnv* jenv;
+    int jni_status = global_jvm->GetEnv((void **) &jenv, JNI_VERSION_1_6);
+    if (jni_status == JNI_EDETACHED) {
+      global_jvm->AttachCurrentThread(reinterpret_cast<void **>(&jenv), nullptr);
+    } else {
+      CHECK(jni_status == JNI_OK);
+    }
+    std::lock_guard<std::mutex> guard(eval_handle_mutex);
+    if (custom_eval_handle == nullptr) {
+      custom_eval_handle = jenv->NewGlobalRef(static_cast<jobject>(handle));
+    }
+  }
+
+  inline static xgboost::bst_float EvalRow(int label,
+                                           const xgboost::bst_float *pred,
+                                           int nclass) {
+    JNIEnv* jenv;
+    global_jvm->AttachCurrentThread(reinterpret_cast<void **>(&jenv), nullptr);
+    jclass eval_interface = jenv->FindClass("ml/dmlc/xgboost4j/java/IEvalMultiClassesDistributed");
+    jmethodID eval_row_func = jenv->GetMethodID(eval_interface, "evalRow", "(IFI)F");
+    return jenv->CallFloatMethod(custom_eval_handle, eval_row_func, label, *pred, nclass);
+  }
+
+  static xgboost::bst_float GetFinal(xgboost::bst_float esum, xgboost::bst_float wsum) {
+    JNIEnv* jenv;
+    global_jvm->AttachCurrentThread(reinterpret_cast<void **>(&jenv), nullptr);
+    jclass eval_interface = jenv->FindClass("ml/dmlc/xgboost4j/java/IEvalMultiClassesDistributed");
+    jmethodID get_final_func = jenv->GetMethodID(eval_interface, "getFinal", "(FF)F");
+    return jenv->CallFloatMethod(custom_eval_handle, get_final_func, esum, wsum);
+  }
+
+  const char *Name() const {
+    return metrics_name.data();
+  }
+
+private:
+  std::string metrics_name;
+
+  static jobject custom_eval_handle;
+  /*! \brief lock guarding the registering*/
+  static std::mutex eval_handle_mutex;
+};
+
+std::mutex CustomEvalMultiClasses::eval_handle_mutex;
+jobject CustomEvalMultiClasses::custom_eval_handle = nullptr;
+
+class CustomEvalElementWise {
+public:
+  CustomEvalElementWise(std::string name, CustomEvalHandle handle):
+    metrics_name(name) {
+    JNIEnv* jenv;
+    int jni_status = global_jvm->GetEnv((void **) &jenv, JNI_VERSION_1_6);
+    if (jni_status == JNI_EDETACHED) {
+      global_jvm->AttachCurrentThread(reinterpret_cast<void **>(&jenv), nullptr);
+    } else {
+      CHECK(jni_status == JNI_OK);
+    }
+    std::lock_guard<std::mutex> guard(eval_handle_mutex);
+    if (custom_eval_handle == nullptr) {
+      custom_eval_handle = jenv->NewGlobalRef(static_cast<jobject>(handle));
+    }
+  }
+
+  XGBOOST_DEVICE xgboost::bst_float EvalRow(xgboost::bst_float label,
+          xgboost::bst_float pred) const {
+    JNIEnv* jenv;
+    int jni_status = global_jvm->GetEnv((void **) &jenv, JNI_VERSION_1_6);
+    if (jni_status == JNI_EDETACHED) {
+      global_jvm->AttachCurrentThread(reinterpret_cast<void **>(&jenv), nullptr);
+    } else {
+      CHECK(jni_status == JNI_OK);
+    }
+    jclass eval_interface = jenv->FindClass("ml/dmlc/xgboost4j/java/IEvalElementWiseDistributed");
+    jmethodID eval_row_func = jenv->GetMethodID(eval_interface, "evalRow", "(FF)F");
+    return jenv->CallFloatMethod(custom_eval_handle, eval_row_func, label, pred);
+  }
+
+  static xgboost::bst_float GetFinal(xgboost::bst_float esum, xgboost::bst_float wsum) {
+    JNIEnv* jenv;
+    int jni_status = global_jvm->GetEnv((void **) &jenv, JNI_VERSION_1_6);
+    if (jni_status == JNI_EDETACHED) {
+      global_jvm->AttachCurrentThread(reinterpret_cast<void **>(&jenv), nullptr);
+    } else {
+      CHECK(jni_status == JNI_OK);
+    }
+    jclass eval_interface = jenv->FindClass("ml/dmlc/xgboost4j/java/IEvalElementWiseDistributed");
+    jmethodID get_final_func = jenv->GetMethodID(eval_interface, "getFinal", "(FF)F");
+    return jenv->CallFloatMethod(custom_eval_handle, get_final_func, esum, wsum);
+  }
+
+  const char *Name() const {
+    return metrics_name.data();
+  }
+
+private:
+  std::string metrics_name;
+
+  static jobject custom_eval_handle;
+  /*! \brief lock guarding the registering*/
+  static std::mutex eval_handle_mutex;
+};
+
+std::mutex CustomEvalElementWise::eval_handle_mutex;
+jobject CustomEvalElementWise::custom_eval_handle = nullptr;
+
 /*
  * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
  * Method:    XGBGetLastError
@@ -769,6 +935,43 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterSetAttr
   return ret;
 }
 
+std::mutex registering_mutex;
+/*
+ * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
+ * Method:    XGBoosterAddNewMetrics
+ * Signature: (JLjava/lang/String;)I
+ */
+JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterAddNewMetrics
+  (JNIEnv *jenv, jclass jcls, jlong jhandle, jstring metrics_name,
+          jstring eval_type, jobject custom_eval) {
+  std::string metrics_name_in_str = jenv->GetStringUTFChars(metrics_name, 0);
+  std::string eval_type_in_str = jenv->GetStringUTFChars(eval_type, 0);
+  registering_mutex.lock();
+  if (eval_type_in_str == "regression/binary") {
+    XGBOOST_REGISTER_METRIC(CUSTOM_METRICS, metrics_name_in_str)
+            .describe("customized metrics for binary/regression")
+            .set_body([=](const char *param) {
+              return new xgboost::metric::EvalEWiseBase<CustomEvalElementWise>(
+                      *(new CustomEvalElementWise(metrics_name_in_str, custom_eval)));
+            });
+  } else if (eval_type_in_str == "multi_classes") {
+    XGBOOST_REGISTER_METRIC(CUSTOM_METRICS, metrics_name_in_str)
+            .describe("customized metrics for multi_classes")
+            .set_body([=](const char *param) {
+              return new CustomEvalMultiClasses(metrics_name_in_str, custom_eval);
+            });
+  } else if (eval_type_in_str == "ranking") {
+    XGBOOST_REGISTER_METRIC(CUSTOM_METRICS, metrics_name_in_str)
+            .describe("customized metrics for ranking")
+            .set_body([=](const char *param) {
+              return new CustomEvalRanking(metrics_name_in_str, custom_eval);
+            });
+  }
+  registering_mutex.unlock();
+  XGBoosterRegisterNewMetrics((BoosterHandle) jhandle, metrics_name_in_str.data());
+  return 0;
+}
+
 /*
  * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
  * Method:    XGBoosterLoadRabitCheckpoint
diff --git a/jvm-packages/xgboost4j/src/native/xgboost4j.h b/jvm-packages/xgboost4j/src/native/xgboost4j.h
index 96eaa97b27cc..e1b3330b2e40 100644
--- a/jvm-packages/xgboost4j/src/native/xgboost4j.h
+++ b/jvm-packages/xgboost4j/src/native/xgboost4j.h
@@ -255,6 +255,14 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterGetAttr
 JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterSetAttr
   (JNIEnv *, jclass, jlong, jstring, jstring);
 
+/*
+ * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
+ * Method:    XGBoosterAddNewMetrics
+ * Signature: (JLjava/lang/String;)I
+ */
+JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterAddNewMetrics
+  (JNIEnv *, jclass, jlong, jstring, jstring, jobject);
+
 /*
  * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
  * Method:    XGBoosterLoadRabitCheckpoint
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index ac9c35c4c7de..706f9fd0e9a3 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -1157,5 +1157,16 @@ QueryBoosterConfigurationArguments(BoosterHandle handle) {
   return bst->learner()->GetConfigurationArguments();
 }
 
+XGB_DLL void XGBoosterRegisterNewMetrics(BoosterHandle handle, const char* metrics_name) {
+  auto* bst = static_cast<xgboost::Booster*>(handle);
+  // note: this function is only called by jvm packages which does not support multiple
+  // evaluation metrics for now,
+  // as a result, we clear all registered metrics and add the new customized one
+  bst->learner()->metrics_.clear();
+  bst->learner()->metrics_.emplace_back(Metric::Create(metrics_name));
+  bst->learner()->metrics_.back()->Configure(bst->learner()->GetConfigurationArguments().begin(),
+                                             bst->learner()->GetConfigurationArguments().end());
+}
+
 // force link rabit
 static DMLC_ATTRIBUTE_UNUSED int XGBOOST_LINK_RABIT_C_API_ = RabitLinkTag();
diff --git a/src/metric/elementwise_metric.cu b/src/metric/elementwise_metric.cu
index a9221be849bf..e220f714187f 100644
--- a/src/metric/elementwise_metric.cu
+++ b/src/metric/elementwise_metric.cu
@@ -1,15 +1,14 @@
 /*!
  * Copyright 2015-2019 by Contributors
- * \file elementwise_metric.cc
- * \brief evaluation metrics for elementwise binary or regression.
- * \author Kailong Chen, Tianqi Chen
  */
+
 #include <rabit/rabit.h>
-#include <xgboost/metric.h>
+#include <xgboost/metric/metric.h>
+#include <xgboost/metric/metric_common.h>
+#include <xgboost/metric/elementwise_metric.h>
 #include <dmlc/registry.h>
 #include <cmath>
 
-#include "metric_common.h"
 #include "../common/math.h"
 #include "../common/common.h"
 
@@ -22,123 +21,14 @@
 #include "../common/device_helpers.cuh"
 #endif  // XGBOOST_USE_CUDA
 
+#include "../common/math.h"
+#include "../common/common.h"
+
 namespace xgboost {
 namespace metric {
 // tag the this file, used by force static link later.
 DMLC_REGISTRY_FILE_TAG(elementwise_metric);
 
-template <typename EvalRow>
-class ElementWiseMetricsReduction {
- public:
-  explicit ElementWiseMetricsReduction(EvalRow policy) :
-    policy_(std::move(policy)) {}
-
-  PackedReduceResult CpuReduceMetrics(
-      const HostDeviceVector<bst_float>& weights,
-      const HostDeviceVector<bst_float>& labels,
-      const HostDeviceVector<bst_float>& preds) const {
-    size_t ndata = labels.Size();
-
-    const auto& h_labels = labels.HostVector();
-    const auto& h_weights = weights.HostVector();
-    const auto& h_preds = preds.HostVector();
-
-    bst_float residue_sum = 0;
-    bst_float weights_sum = 0;
-
-#pragma omp parallel for reduction(+: residue_sum, weights_sum) schedule(static)
-    for (omp_ulong i = 0; i < ndata; ++i) {
-      const bst_float wt = h_weights.size() > 0 ? h_weights[i] : 1.0f;
-      residue_sum += policy_.EvalRow(h_labels[i], h_preds[i]) * wt;
-      weights_sum += wt;
-    }
-    PackedReduceResult res { residue_sum, weights_sum };
-    return res;
-  }
-
-#if defined(XGBOOST_USE_CUDA)
-
-  PackedReduceResult DeviceReduceMetrics(
-      GPUSet::GpuIdType device_id,
-      size_t device_index,
-      const HostDeviceVector<bst_float>& weights,
-      const HostDeviceVector<bst_float>& labels,
-      const HostDeviceVector<bst_float>& preds) {
-    size_t n_data = preds.DeviceSize(device_id);
-
-    thrust::counting_iterator<size_t> begin(0);
-    thrust::counting_iterator<size_t> end = begin + n_data;
-
-    auto s_label = labels.DeviceSpan(device_id);
-    auto s_preds = preds.DeviceSpan(device_id);
-    auto s_weights = weights.DeviceSpan(device_id);
-
-    bool const is_null_weight = weights.Size() == 0;
-
-    auto d_policy = policy_;
-
-    PackedReduceResult result = thrust::transform_reduce(
-        thrust::cuda::par(allocators_.at(device_index)),
-        begin, end,
-        [=] XGBOOST_DEVICE(size_t idx) {
-          bst_float weight = is_null_weight ? 1.0f : s_weights[idx];
-
-          bst_float residue = d_policy.EvalRow(s_label[idx], s_preds[idx]);
-          residue *= weight;
-          return PackedReduceResult{ residue, weight };
-        },
-        PackedReduceResult(),
-        thrust::plus<PackedReduceResult>());
-
-    return result;
-  }
-
-#endif  // XGBOOST_USE_CUDA
-
-  PackedReduceResult Reduce(
-      GPUSet devices,
-      const HostDeviceVector<bst_float>& weights,
-      const HostDeviceVector<bst_float>& labels,
-      const HostDeviceVector<bst_float>& preds) {
-    PackedReduceResult result;
-
-    if (devices.IsEmpty()) {
-      result = CpuReduceMetrics(weights, labels, preds);
-    }
-#if defined(XGBOOST_USE_CUDA)
-    else {  // NOLINT
-      if (allocators_.size() != devices.Size()) {
-        allocators_.clear();
-        allocators_.resize(devices.Size());
-      }
-      preds.Reshard(devices);
-      labels.Reshard(devices);
-      weights.Reshard(devices);
-      std::vector<PackedReduceResult> res_per_device(devices.Size());
-
-#pragma omp parallel for schedule(static, 1) if (devices.Size() > 1)
-      for (GPUSet::GpuIdType id = *devices.begin(); id < *devices.end(); ++id) {
-        dh::safe_cuda(cudaSetDevice(id));
-        size_t index = devices.Index(id);
-        res_per_device.at(index) =
-            DeviceReduceMetrics(id, index, weights, labels, preds);
-      }
-
-      for (auto const& res : res_per_device) {
-        result += res;
-      }
-    }
-#endif  // defined(XGBOOST_USE_CUDA)
-    return result;
-  }
-
- private:
-  EvalRow policy_;
-#if defined(XGBOOST_USE_CUDA)
-  std::vector<dh::CubMemory> allocators_;
-#endif  // defined(XGBOOST_USE_CUDA)
-};
-
 struct EvalRowRMSE {
   char const *Name() const {
     return "rmse";
@@ -304,53 +194,6 @@ struct EvalTweedieNLogLik {
  protected:
   bst_float rho_;
 };
-/*!
- * \brief base class of element-wise evaluation
- * \tparam Derived the name of subclass
- */
-template<typename Policy>
-struct EvalEWiseBase : public Metric {
-  EvalEWiseBase() : policy_{}, reducer_{policy_} {}
-  explicit EvalEWiseBase(char const* policy_param) :
-    policy_{policy_param}, reducer_{policy_} {}
-
-  void Configure(
-      const std::vector<std::pair<std::string, std::string> >& args) override {
-    param_.InitAllowUnknown(args);
-  }
-
-  bst_float Eval(const HostDeviceVector<bst_float>& preds,
-                 const MetaInfo& info,
-                 bool distributed) override {
-    CHECK_NE(info.labels_.Size(), 0U) << "label set cannot be empty";
-    CHECK_EQ(preds.Size(), info.labels_.Size())
-        << "label and prediction size not match, "
-        << "hint: use merror or mlogloss for multi-class classification";
-    const auto ndata = static_cast<omp_ulong>(info.labels_.Size());
-    // Dealing with ndata < n_gpus.
-    GPUSet devices = GPUSet::All(param_.gpu_id, param_.n_gpus, ndata);
-
-    auto result =
-        reducer_.Reduce(devices, info.weights_, info.labels_, preds);
-
-    double dat[2] { result.Residue(), result.Weights() };
-    if (distributed) {
-      rabit::Allreduce<rabit::op::Sum>(dat, 2);
-    }
-    return Policy::GetFinal(dat[0], dat[1]);
-  }
-
-  const char* Name() const override {
-    return policy_.Name();
-  }
-
- private:
-  Policy policy_;
-
-  MetricParam param_;
-
-  ElementWiseMetricsReduction<Policy> reducer_;
-};
 
 XGBOOST_REGISTER_METRIC(RMSE, "rmse")
 .describe("Rooted mean square error.")
diff --git a/src/metric/metric.cc b/src/metric/metric.cc
index 8d3d9d9280cc..8074b6f07701 100644
--- a/src/metric/metric.cc
+++ b/src/metric/metric.cc
@@ -3,10 +3,10 @@
  * \file metric_registry.cc
  * \brief Registry of objective functions.
  */
-#include <xgboost/metric.h>
-#include <dmlc/registry.h>
+#include <xgboost/metric/metric.h>
+#include <xgboost/metric/metric_common.h>
 
-#include "metric_common.h"
+#include <dmlc/registry.h>
 
 namespace dmlc {
 DMLC_REGISTRY_ENABLE(::xgboost::MetricReg);
diff --git a/src/metric/multiclass_metric.cc b/src/metric/multiclass_metric.cc
index 7733a334f5c0..6e721e3b68fd 100644
--- a/src/metric/multiclass_metric.cc
+++ b/src/metric/multiclass_metric.cc
@@ -6,3 +6,4 @@
 #if !defined(XGBOOST_USE_CUDA)
 #include "multiclass_metric.cu"
 #endif  // !defined(XGBOOST_USE_CUDA)
+
diff --git a/src/metric/multiclass_metric.cu b/src/metric/multiclass_metric.cu
index 88af0014ed5a..54d879923aea 100644
--- a/src/metric/multiclass_metric.cu
+++ b/src/metric/multiclass_metric.cu
@@ -5,221 +5,19 @@
  * \author Kailong Chen, Tianqi Chen
  */
 #include <rabit/rabit.h>
-#include <xgboost/metric.h>
+#include <xgboost/metric/metric.h>
+#include <xgboost/metric/metric_common.h>
+#include <xgboost/metric/multiclass_metric.h>
 #include <cmath>
 
-#include "metric_common.h"
 #include "../common/math.h"
 #include "../common/common.h"
 
-#if defined(XGBOOST_USE_CUDA)
-#include <thrust/execution_policy.h>  // thrust::cuda::par
-#include <thrust/functional.h>        // thrust::plus<>
-#include <thrust/transform_reduce.h>
-#include <thrust/iterator/counting_iterator.h>
-
-#include "../common/device_helpers.cuh"
-#endif  // XGBOOST_USE_CUDA
-
 namespace xgboost {
 namespace metric {
 // tag the this file, used by force static link later.
 DMLC_REGISTRY_FILE_TAG(multiclass_metric);
 
-template <typename EvalRowPolicy>
-class MultiClassMetricsReduction {
-  void CheckLabelError(int32_t label_error, size_t n_class) const {
-    CHECK(label_error >= 0 && label_error < static_cast<int32_t>(n_class))
-        << "MultiClassEvaluation: label must be in [0, num_class),"
-        << " num_class=" << n_class << " but found " << label_error << " in label";
-  }
-
- public:
-  MultiClassMetricsReduction() = default;
-
-  PackedReduceResult CpuReduceMetrics(
-      const HostDeviceVector<bst_float>& weights,
-      const HostDeviceVector<bst_float>& labels,
-      const HostDeviceVector<bst_float>& preds,
-      const size_t n_class) const {
-    size_t ndata = labels.Size();
-
-    const auto& h_labels = labels.HostVector();
-    const auto& h_weights = weights.HostVector();
-    const auto& h_preds = preds.HostVector();
-
-    bst_float residue_sum = 0;
-    bst_float weights_sum = 0;
-    int label_error = 0;
-    bool const is_null_weight = weights.Size() == 0;
-
-#pragma omp parallel for reduction(+: residue_sum, weights_sum) schedule(static)
-    for (omp_ulong idx = 0; idx < ndata; ++idx) {
-      bst_float weight = is_null_weight ? 1.0f : h_weights[idx];
-      auto label = static_cast<int>(h_labels[idx]);
-      if (label >= 0 && label < static_cast<int>(n_class)) {
-        residue_sum += EvalRowPolicy::EvalRow(
-            label, h_preds.data() + idx * n_class, n_class) * weight;
-        weights_sum += weight;
-      } else {
-        label_error = label;
-      }
-    }
-    CheckLabelError(label_error, n_class);
-    PackedReduceResult res { residue_sum, weights_sum };
-
-    return res;
-  }
-
-#if defined(XGBOOST_USE_CUDA)
-
-  PackedReduceResult DeviceReduceMetrics(
-      GPUSet::GpuIdType device_id,
-      size_t device_index,
-      const HostDeviceVector<bst_float>& weights,
-      const HostDeviceVector<bst_float>& labels,
-      const HostDeviceVector<bst_float>& preds,
-      const size_t n_class) {
-    size_t n_data = labels.DeviceSize(device_id);
-
-    thrust::counting_iterator<size_t> begin(0);
-    thrust::counting_iterator<size_t> end = begin + n_data;
-
-    auto s_labels = labels.DeviceSpan(device_id);
-    auto s_preds = preds.DeviceSpan(device_id);
-    auto s_weights = weights.DeviceSpan(device_id);
-
-    bool const is_null_weight = weights.Size() == 0;
-    auto s_label_error = label_error_.GetSpan<int32_t>(1);
-    s_label_error[0] = 0;
-
-    PackedReduceResult result = thrust::transform_reduce(
-        thrust::cuda::par(allocators_.at(device_index)),
-        begin, end,
-        [=] XGBOOST_DEVICE(size_t idx) {
-          bst_float weight = is_null_weight ? 1.0f : s_weights[idx];
-          bst_float residue = 0;
-          auto label = static_cast<int>(s_labels[idx]);
-          if (label >= 0 && label < static_cast<int32_t>(n_class)) {
-            residue = EvalRowPolicy::EvalRow(
-                label, &s_preds[idx * n_class], n_class) * weight;
-          } else {
-            s_label_error[0] = label;
-          }
-          return PackedReduceResult{ residue, weight };
-        },
-        PackedReduceResult(),
-        thrust::plus<PackedReduceResult>());
-    CheckLabelError(s_label_error[0], n_class);
-
-    return result;
-  }
-
-#endif  // XGBOOST_USE_CUDA
-
-  PackedReduceResult Reduce(
-      GPUSet devices,
-      size_t n_class,
-      const HostDeviceVector<bst_float>& weights,
-      const HostDeviceVector<bst_float>& labels,
-      const HostDeviceVector<bst_float>& preds) {
-    PackedReduceResult result;
-
-    if (devices.IsEmpty()) {
-      result = CpuReduceMetrics(weights, labels, preds, n_class);
-    }
-#if defined(XGBOOST_USE_CUDA)
-    else {  // NOLINT
-      if (allocators_.size() != devices.Size()) {
-        allocators_.clear();
-        allocators_.resize(devices.Size());
-      }
-      preds.Reshard(GPUDistribution::Granular(devices, n_class));
-      labels.Reshard(devices);
-      weights.Reshard(devices);
-      std::vector<PackedReduceResult> res_per_device(devices.Size());
-
-#pragma omp parallel for schedule(static, 1) if (devices.Size() > 1)
-      for (GPUSet::GpuIdType id = *devices.begin(); id < *devices.end(); ++id) {
-        dh::safe_cuda(cudaSetDevice(id));
-        size_t index = devices.Index(id);
-        res_per_device.at(index) =
-            DeviceReduceMetrics(id, index, weights, labels, preds, n_class);
-      }
-
-      for (auto const& res : res_per_device) {
-        result += res;
-      }
-    }
-#endif  // defined(XGBOOST_USE_CUDA)
-    return result;
-  }
-
- private:
-#if defined(XGBOOST_USE_CUDA)
-  dh::PinnedMemory label_error_;
-  std::vector<dh::CubMemory> allocators_;
-#endif  // defined(XGBOOST_USE_CUDA)
-};
-
-/*!
- * \brief base class of multi-class evaluation
- * \tparam Derived the name of subclass
- */
-template<typename Derived>
-struct EvalMClassBase : public Metric {
-  void Configure(
-      const std::vector<std::pair<std::string, std::string> >& args) override {
-    param_.InitAllowUnknown(args);
-  }
-
-  bst_float Eval(const HostDeviceVector<bst_float> &preds,
-                 const MetaInfo &info,
-                 bool distributed) override {
-    CHECK_NE(info.labels_.Size(), 0U) << "label set cannot be empty";
-    CHECK(preds.Size() % info.labels_.Size() == 0)
-        << "label and prediction size not match";
-    const size_t nclass = preds.Size() / info.labels_.Size();
-    CHECK_GE(nclass, 1U)
-        << "mlogloss and merror are only used for multi-class classification,"
-        << " use logloss for binary classification";
-    const auto ndata = static_cast<bst_omp_uint>(info.labels_.Size());
-
-    GPUSet devices = GPUSet::All(param_.gpu_id, param_.n_gpus, ndata);
-    auto result = reducer_.Reduce(devices, nclass, info.weights_, info.labels_, preds);
-    double dat[2] { result.Residue(), result.Weights() };
-
-    if (distributed) {
-      rabit::Allreduce<rabit::op::Sum>(dat, 2);
-    }
-    return Derived::GetFinal(dat[0], dat[1]);
-  }
-  /*!
-   * \brief to be implemented by subclass,
-   *   get evaluation result from one row
-   * \param label label of current instance
-   * \param pred prediction value of current instance
-   * \param nclass number of class in the prediction
-   */
-  XGBOOST_DEVICE static bst_float EvalRow(int label,
-                                          const bst_float *pred,
-                                          size_t nclass);
-  /*!
-   * \brief to be overridden by subclass, final transformation
-   * \param esum the sum statistics returned by EvalRow
-   * \param wsum sum of weight
-   */
-  inline static bst_float GetFinal(bst_float esum, bst_float wsum) {
-    return esum / wsum;
-  }
-
- private:
-  MultiClassMetricsReduction<Derived> reducer_;
-  MetricParam param_;
-  // used to store error message
-  const char *error_msg_;
-};
-
 /*! \brief match error */
 struct EvalMatchError : public EvalMClassBase<EvalMatchError> {
   const char* Name() const override {
diff --git a/src/metric/rank_metric.cc b/src/metric/rank_metric.cc
index 43a5a2333924..518816bd9960 100644
--- a/src/metric/rank_metric.cc
+++ b/src/metric/rank_metric.cc
@@ -1,11 +1,11 @@
 /*!
- * Copyright 2015 by Contributors
+ * Copyright 2019 by Contributors
  * \file rank_metric.cc
  * \brief prediction rank based metrics.
- * \author Kailong Chen, Tianqi Chen
  */
 #include <rabit/rabit.h>
-#include <xgboost/metric.h>
+#include <xgboost/metric/metric.h>
+#include <xgboost/metric/ranking_metric.h>
 #include <dmlc/registry.h>
 #include <cmath>
 
@@ -159,83 +159,6 @@ struct EvalAuc : public Metric {
   }
 };
 
-/*! \brief Evaluate rank list */
-struct EvalRankList : public Metric {
- public:
-  bst_float Eval(const HostDeviceVector<bst_float> &preds,
-                 const MetaInfo &info,
-                 bool distributed) override {
-    CHECK_EQ(preds.Size(), info.labels_.Size())
-        << "label size predict size not match";
-    // quick consistency when group is not available
-    std::vector<unsigned> tgptr(2, 0);
-    tgptr[1] = static_cast<unsigned>(preds.Size());
-    const std::vector<unsigned> &gptr = info.group_ptr_.size() == 0 ? tgptr : info.group_ptr_;
-    CHECK_NE(gptr.size(), 0U) << "must specify group when constructing rank file";
-    CHECK_EQ(gptr.back(), preds.Size())
-        << "EvalRanklist: group structure must match number of prediction";
-    const auto ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
-    // sum statistics
-    double sum_metric = 0.0f;
-    const auto& labels = info.labels_.HostVector();
-
-    const std::vector<bst_float>& h_preds = preds.HostVector();
-#pragma omp parallel reduction(+:sum_metric)
-    {
-      // each thread takes a local rec
-      std::vector< std::pair<bst_float, unsigned> > rec;
-      #pragma omp for schedule(static)
-      for (bst_omp_uint k = 0; k < ngroup; ++k) {
-        rec.clear();
-        for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
-          rec.emplace_back(h_preds[j], static_cast<int>(labels[j]));
-        }
-        sum_metric += this->EvalMetric(rec);
-      }
-    }
-    if (distributed) {
-      bst_float dat[2];
-      dat[0] = static_cast<bst_float>(sum_metric);
-      dat[1] = static_cast<bst_float>(ngroup);
-      // approximately estimate the metric using mean
-      rabit::Allreduce<rabit::op::Sum>(dat, 2);
-      return dat[0] / dat[1];
-    } else {
-      return static_cast<bst_float>(sum_metric) / ngroup;
-    }
-  }
-  const char* Name() const override {
-    return name_.c_str();
-  }
-
- protected:
-  explicit EvalRankList(const char* name, const char* param) {
-    using namespace std;  // NOLINT(*)
-    minus_ = false;
-    if (param != nullptr) {
-      std::ostringstream os;
-      os << name << '@' << param;
-      name_ = os.str();
-      if (sscanf(param, "%u[-]?", &topn_) != 1) {
-        topn_ = std::numeric_limits<unsigned>::max();
-      }
-      if (param[strlen(param) - 1] == '-') {
-        minus_ = true;
-      }
-    } else {
-      name_ = name;
-      topn_ = std::numeric_limits<unsigned>::max();
-    }
-  }
-  /*! \return evaluation metric, given the pair_sort record, (pred,label) */
-  virtual bst_float EvalMetric(std::vector<std::pair<bst_float, unsigned> > &pair_sort) const = 0; // NOLINT(*)
-
- protected:
-  unsigned topn_;
-  std::string name_;
-  bool minus_;
-};
-
 /*! \brief Precision at N, for both classification and rank */
 struct EvalPrecision : public EvalRankList{
  public:
diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h
index 33dd072f2c5e..718f124e08ad 100644
--- a/tests/cpp/helpers.h
+++ b/tests/cpp/helpers.h
@@ -16,8 +16,8 @@
 
 #include <xgboost/base.h>
 #include <xgboost/objective.h>
-#include <xgboost/metric.h>
 #include <xgboost/predictor.h>
+#include <xgboost/metric/metric.h>
 
 #if defined(__CUDACC__)
 #define DeclareUnifiedTest(name) GPU ## name
diff --git a/tests/cpp/metric/test_elementwise_metric.cc b/tests/cpp/metric/test_elementwise_metric.cc
index f4e3f413719b..62133ffecb5f 100644
--- a/tests/cpp/metric/test_elementwise_metric.cc
+++ b/tests/cpp/metric/test_elementwise_metric.cc
@@ -1,7 +1,7 @@
 /*!
  * Copyright 2018 XGBoost contributors
  */
-#include <xgboost/metric.h>
+#include <xgboost/metric/metric.h>
 #include <map>
 #include "../helpers.h"
 
diff --git a/tests/cpp/metric/test_metric.cc b/tests/cpp/metric/test_metric.cc
index 8311663b0b8d..46a1a6559d70 100644
--- a/tests/cpp/metric/test_metric.cc
+++ b/tests/cpp/metric/test_metric.cc
@@ -1,5 +1,5 @@
 // Copyright by Contributors
-#include <xgboost/metric.h>
+#include <xgboost/metric/metric.h>
 
 #include "../helpers.h"
 
diff --git a/tests/cpp/metric/test_multiclass_metric.cc b/tests/cpp/metric/test_multiclass_metric.cc
index 79954784593a..59f16128b066 100644
--- a/tests/cpp/metric/test_multiclass_metric.cc
+++ b/tests/cpp/metric/test_multiclass_metric.cc
@@ -1,5 +1,5 @@
 // Copyright by Contributors
-#include <xgboost/metric.h>
+#include <xgboost/metric/metric.h>
 #include <string>
 
 #include "../helpers.h"
diff --git a/tests/cpp/metric/test_rank_metric.cc b/tests/cpp/metric/test_rank_metric.cc
index e8082fc67b68..2d0e1f74560b 100644
--- a/tests/cpp/metric/test_rank_metric.cc
+++ b/tests/cpp/metric/test_rank_metric.cc
@@ -1,5 +1,5 @@
 // Copyright by Contributors
-#include <xgboost/metric.h>
+#include <xgboost/metric/metric.h>
 
 #include "../helpers.h"