From 2f3edf1d7a4d22b50f3173b678e6fdd247626ee6 Mon Sep 17 00:00:00 2001 From: booksword Date: Sun, 24 Jul 2022 16:01:11 +0800 Subject: [PATCH 1/8] bug fix: 1) 100 should be used to scale down percentileofscore return to 0-1, not length of array; 2) for (linear) weighted MA(n), weight should be n, n-1, ..., 1 instead of n-1, ..., 0 --- qlib/data/ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qlib/data/ops.py b/qlib/data/ops.py index 1cbb1d2e628..2fe6acbd6d1 100644 --- a/qlib/data/ops.py +++ b/qlib/data/ops.py @@ -1165,7 +1165,7 @@ def rank(x): x1 = x[~np.isnan(x)] if x1.shape[0] == 0: return np.nan - return percentileofscore(x1, x1[-1]) / len(x1) + return percentileofscore(x1, x1[-1]) / 100 if self.N == 0: series = series.expanding(min_periods=1).apply(rank, raw=True) @@ -1341,7 +1341,7 @@ def _load_internal(self, instrument, start_index, end_index, *args): # TODO: implement in Cython def weighted_mean(x): - w = np.arange(len(x)) + w = np.arange(len(x)) + 1 w = w / w.sum() return np.nanmean(w * x) From 9e88ac39c46e295a909d348226cd62adb87517b3 Mon Sep 17 00:00:00 2001 From: booksword Date: Sat, 13 Aug 2022 09:42:02 +0800 Subject: [PATCH 2/8] use native pandas function for rank --- qlib/data/ops.py | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/qlib/data/ops.py b/qlib/data/ops.py index 2fe6acbd6d1..63aad7b3b25 100644 --- a/qlib/data/ops.py +++ b/qlib/data/ops.py @@ -34,8 +34,6 @@ #################### Element-Wise Operator #################### - - class ElemOperator(ExpressionOps): """Element-wise Operator @@ -216,9 +214,7 @@ class Not(NpElemOperator): Parameters ---------- - feature_left : Expression - feature instance - feature_right : Expression + feature : Expression feature instance Returns @@ -241,8 +237,6 @@ class PairOperator(ExpressionOps): feature instance or numeric value feature_right : Expression feature instance or numeric value - func : str - operator function Returns ---------- @@ 
-1157,20 +1151,11 @@ def __init__(self, feature, N): def _load_internal(self, instrument, start_index, end_index, *args): series = self.feature.load(instrument, start_index, end_index, *args) - # TODO: implement in Cython - - def rank(x): - if np.isnan(x[-1]): - return np.nan - x1 = x[~np.isnan(x)] - if x1.shape[0] == 0: - return np.nan - return percentileofscore(x1, x1[-1]) / 100 if self.N == 0: - series = series.expanding(min_periods=1).apply(rank, raw=True) + series = series.expanding(min_periods=1).rank(pct=True) else: - series = series.rolling(self.N, min_periods=1).apply(rank, raw=True) + series = series.rolling(self.N, min_periods=1).rank(pct=True) return series From 11785c9e4ed3fd4f33e0f2c9776f488c45dfea88 Mon Sep 17 00:00:00 2001 From: booksword Date: Sat, 13 Aug 2022 09:46:16 +0800 Subject: [PATCH 3/8] remove useless import --- qlib/data/ops.py | 1 - 1 file changed, 1 deletion(-) diff --git a/qlib/data/ops.py b/qlib/data/ops.py index 63aad7b3b25..a2805700aff 100644 --- a/qlib/data/ops.py +++ b/qlib/data/ops.py @@ -9,7 +9,6 @@ import pandas as pd from typing import Union, List, Type -from scipy.stats import percentileofscore from .base import Expression, ExpressionOps, Feature, PFeature from ..log import get_module_logger from ..utils import get_callable_kwargs From b4f84082667ba1c44fe569a373eed48956028310 Mon Sep 17 00:00:00 2001 From: BookSword Date: Tue, 23 Aug 2022 16:19:17 +0800 Subject: [PATCH 4/8] require pandas 1.4+ --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0ca9f26ba97..ebd81bc2a57 100644 --- a/setup.py +++ b/setup.py @@ -49,7 +49,7 @@ def get_version(rel_path: str) -> str: # `estimator` may depend on other packages. In order to reduce dependencies, it is not written here. 
REQUIRED = [ "numpy>=1.12.0", - "pandas>=0.25.1", + "pandas>=1.4.0", "scipy>=1.0.0", "requests>=2.18.0", "sacred>=0.7.4", From 223f2c79163f1333609a3fab3ad07b3d25a73093 Mon Sep 17 00:00:00 2001 From: BookSword Date: Tue, 23 Aug 2022 18:52:25 +0800 Subject: [PATCH 5/8] rank for py37+pandas 1.3.5 compatibility --- qlib/data/ops.py | 25 +++++++++++++++++++++++-- setup.py | 2 +- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/qlib/data/ops.py b/qlib/data/ops.py index a2805700aff..93b3266a39e 100644 --- a/qlib/data/ops.py +++ b/qlib/data/ops.py @@ -9,6 +9,7 @@ import pandas as pd from typing import Union, List, Type +from scipy.stats import percentileofscore from .base import Expression, ExpressionOps, Feature, PFeature from ..log import get_module_logger from ..utils import get_callable_kwargs @@ -1147,16 +1148,36 @@ class Rank(Rolling): def __init__(self, feature, N): super(Rank, self).__init__(feature, N, "rank") + major_version, minor_version, *_ = pd.__version__.split('.') + self._load_internal = self._load_internal_pd14 \ + if int(major_version) > 1 or int(major_version) == 1 and \ + int(minor_version) >3 else self._load_internal_pd_below_13 - def _load_internal(self, instrument, start_index, end_index, *args): + def _load_internal_pd14(self, instrument, start_index, end_index, *args): series = self.feature.load(instrument, start_index, end_index, *args) - if self.N == 0: series = series.expanding(min_periods=1).rank(pct=True) else: series = series.rolling(self.N, min_periods=1).rank(pct=True) return series + # for compatiblity of python 3.7, which doesn't support pandas 1.4.0+ which implements Rolling.rank + def _load_internal_pd_below_13(self, instrument, start_index, end_index, *args): + series = self.feature.load(instrument, start_index, end_index, *args) + def rank(x): + if np.isnan(x[-1]): + return np.nan + x1 = x[~np.isnan(x)] + if x1.shape[0] == 0: + return np.nan + return percentileofscore(x1, x1[-1]) / 100 + + if self.N == 0: + series = 
series.expanding(min_periods=1).apply(rank, raw=True) + else: + series = series.rolling(self.N, min_periods=1).apply(rank, raw=True) + return series + class Count(Rolling): """Rolling Count diff --git a/setup.py b/setup.py index ebd81bc2a57..0ca9f26ba97 100644 --- a/setup.py +++ b/setup.py @@ -49,7 +49,7 @@ def get_version(rel_path: str) -> str: # `estimator` may depend on other packages. In order to reduce dependencies, it is not written here. REQUIRED = [ "numpy>=1.12.0", - "pandas>=1.4.0", + "pandas>=0.25.1", "scipy>=1.0.0", "requests>=2.18.0", "sacred>=0.7.4", From ad05334266f89f9ac8cf72828ac4e1e20af6bbd1 Mon Sep 17 00:00:00 2001 From: BookSword Date: Tue, 23 Aug 2022 19:02:03 +0800 Subject: [PATCH 6/8] lint improvement --- qlib/data/ops.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/qlib/data/ops.py b/qlib/data/ops.py index 93b3266a39e..b1f4d41f192 100644 --- a/qlib/data/ops.py +++ b/qlib/data/ops.py @@ -1149,9 +1149,11 @@ class Rank(Rolling): def __init__(self, feature, N): super(Rank, self).__init__(feature, N, "rank") major_version, minor_version, *_ = pd.__version__.split('.') - self._load_internal = self._load_internal_pd14 \ - if int(major_version) > 1 or int(major_version) == 1 and \ - int(minor_version) >3 else self._load_internal_pd_below_13 + self._load_internal = ( + self._load_internal_pd14 + if int(major_version) > 1 or int(major_version) == 1 and int(minor_version) > 3 + else self._load_internal_pd_below_13 + ) def _load_internal_pd14(self, instrument, start_index, end_index, *args): series = self.feature.load(instrument, start_index, end_index, *args) @@ -1164,6 +1166,7 @@ def _load_internal_pd14(self, instrument, start_index, end_index, *args): # for compatiblity of python 3.7, which doesn't support pandas 1.4.0+ which implements Rolling.rank def _load_internal_pd_below_13(self, instrument, start_index, end_index, *args): series = self.feature.load(instrument, start_index, end_index, *args) + def rank(x): if 
np.isnan(x[-1]): return np.nan From 33ed6c357fd518d5f1dea172efece0d5fab08b08 Mon Sep 17 00:00:00 2001 From: BookSword Date: Tue, 23 Aug 2022 19:45:15 +0800 Subject: [PATCH 7/8] lint black fix --- qlib/data/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qlib/data/ops.py b/qlib/data/ops.py index b1f4d41f192..cfc1ed9b1bd 100644 --- a/qlib/data/ops.py +++ b/qlib/data/ops.py @@ -1148,7 +1148,7 @@ class Rank(Rolling): def __init__(self, feature, N): super(Rank, self).__init__(feature, N, "rank") - major_version, minor_version, *_ = pd.__version__.split('.') + major_version, minor_version, *_ = pd.__version__.split(".") self._load_internal = ( self._load_internal_pd14 if int(major_version) > 1 or int(major_version) == 1 and int(minor_version) > 3 From ee68adccdc8c3b5a96c516847a65167083c55f9d Mon Sep 17 00:00:00 2001 From: BookSword Date: Tue, 30 Aug 2022 14:42:12 +0800 Subject: [PATCH 8/8] use hasattr instead of version to check whether rolling.rank is implemented --- qlib/data/ops.py | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/qlib/data/ops.py b/qlib/data/ops.py index cfc1ed9b1bd..fe2ebc9f6d9 100644 --- a/qlib/data/ops.py +++ b/qlib/data/ops.py @@ -1148,25 +1148,15 @@ class Rank(Rolling): def __init__(self, feature, N): super(Rank, self).__init__(feature, N, "rank") - major_version, minor_version, *_ = pd.__version__.split(".") - self._load_internal = ( - self._load_internal_pd14 - if int(major_version) > 1 or int(major_version) == 1 and int(minor_version) > 3 - else self._load_internal_pd_below_13 - ) - - def _load_internal_pd14(self, instrument, start_index, end_index, *args): - series = self.feature.load(instrument, start_index, end_index, *args) - if self.N == 0: - series = series.expanding(min_periods=1).rank(pct=True) - else: - series = series.rolling(self.N, min_periods=1).rank(pct=True) - return series # for compatiblity of python 3.7, which doesn't support pandas 1.4.0+ which implements 
Rolling.rank - def _load_internal_pd_below_13(self, instrument, start_index, end_index, *args): + def _load_internal(self, instrument, start_index, end_index, *args): series = self.feature.load(instrument, start_index, end_index, *args) + rolling_or_expending = series.expanding(min_periods=1) if self.N == 0 else series.rolling(self.N, min_periods=1) + if hasattr(rolling_or_expending, "rank"): + return rolling_or_expending.rank(pct=True) + def rank(x): if np.isnan(x[-1]): return np.nan @@ -1175,11 +1165,7 @@ def rank(x): return np.nan return percentileofscore(x1, x1[-1]) / 100 - if self.N == 0: - series = series.expanding(min_periods=1).apply(rank, raw=True) - else: - series = series.rolling(self.N, min_periods=1).apply(rank, raw=True) - return series + return rolling_or_expending.apply(rank, raw=True) class Count(Rolling):