Skip to content

Commit 5208c8c

Browse files
committed
Price repair improvements
Price repair improvements: (1) don't attempt repair of an empty prices table; (2) random-mixups: fix 0.01x errors, not just 100x errors; (3) stop zeroes, big dividends, and 100x errors from triggering false split-error detections.
1 parent 279726a commit 5208c8c

File tree

1 file changed

+91
-33
lines changed

1 file changed

+91
-33
lines changed

yfinance/base.py

Lines changed: 91 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -829,6 +829,8 @@ def _reconstruct_intervals_batch(self, df, interval, prepost, tag=-1):
829829

830830
@utils.log_indent_decorator
831831
def _fix_unit_mixups(self, df, interval, tz_exchange, prepost):
832+
if df.empty:
833+
return df
832834
df2 = self._fix_unit_switch(df, interval, tz_exchange)
833835
df3 = self._fix_unit_random_mixups(df2, interval, tz_exchange, prepost)
834836
return df3
@@ -842,6 +844,9 @@ def _fix_unit_random_mixups(self, df, interval, tz_exchange, prepost):
842844
# - a sudden switch between $<->cents at some date
843845
# This function fixes the first.
844846

847+
if df.empty:
848+
return df
849+
845850
# Easy to detect and fix, just look for outliers = ~100x local median
846851
logger = utils.get_yf_logger()
847852

@@ -885,7 +890,11 @@ def _fix_unit_random_mixups(self, df, interval, tz_exchange, prepost):
885890
ratio = df2_data / median
886891
ratio_rounded = (ratio / 20).round() * 20 # round ratio to nearest 20
887892
f = ratio_rounded == 100
888-
if not f.any():
893+
ratio_rcp = 1.0/ratio
894+
ratio_rcp_rounded = (ratio_rcp / 20).round() * 20 # round ratio to nearest 20
895+
f_rcp = (ratio_rounded == 100) | (ratio_rcp_rounded == 100)
896+
f_either = f | f_rcp
897+
if not f_either.any():
889898
logger.info("price-repair-100x: No sporadic 100x errors")
890899
if "Repaired?" not in df.columns:
891900
df["Repaired?"] = False
@@ -894,7 +903,7 @@ def _fix_unit_random_mixups(self, df, interval, tz_exchange, prepost):
894903
# Mark values to send for repair
895904
tag = -1.0
896905
for i in range(len(data_cols)):
897-
fi = f[:, i]
906+
fi = f_either[:, i]
898907
c = data_cols[i]
899908
df2.loc[fi, c] = tag
900909

@@ -906,35 +915,43 @@ def _fix_unit_random_mixups(self, df, interval, tz_exchange, prepost):
906915
if n_after > 0:
907916
# This second pass will *crudely* "fix" any remaining errors in High/Low
908917
# simply by ensuring they don't contradict e.g. Low = 100x High.
909-
f = df2_tagged
918+
f = (df2[data_cols].to_numpy() == tag) & f
910919
for i in range(f.shape[0]):
911920
fi = f[i, :]
912921
if not fi.any():
913922
continue
914923
idx = df2.index[i]
915924

916-
c = "Open"
917-
j = data_cols.index(c)
918-
if fi[j]:
919-
df2.loc[idx, c] = df.loc[idx, c] * 0.01
920-
#
921-
c = "Close"
922-
j = data_cols.index(c)
925+
for c in ['Open', 'Close']:
926+
j = data_cols.index(c)
927+
if fi[j]:
928+
df2.loc[idx, c] = df.loc[idx, c] * 0.01
929+
930+
c = "High" ; j = data_cols.index(c)
923931
if fi[j]:
924-
df2.loc[idx, c] = df.loc[idx, c] * 0.01
925-
#
926-
c = "Adj Close"
927-
j = data_cols.index(c)
932+
df2.loc[idx, c] = df2.loc[idx, ["Open", "Close"]].max()
933+
934+
c = "Low" ; j = data_cols.index(c)
928935
if fi[j]:
929-
df2.loc[idx, c] = df.loc[idx, c] * 0.01
930-
#
931-
c = "High"
932-
j = data_cols.index(c)
936+
df2.loc[idx, c] = df2.loc[idx, ["Open", "Close"]].min()
937+
938+
f_rcp = (df2[data_cols].to_numpy() == tag) & f_rcp
939+
for i in range(f_rcp.shape[0]):
940+
fi = f_rcp[i, :]
941+
if not fi.any():
942+
continue
943+
idx = df2.index[i]
944+
945+
for c in ['Open', 'Close']:
946+
j = data_cols.index(c)
947+
if fi[j]:
948+
df2.loc[idx, c] = df.loc[idx, c] * 100.0
949+
950+
c = "High" ; j = data_cols.index(c)
933951
if fi[j]:
934952
df2.loc[idx, c] = df2.loc[idx, ["Open", "Close"]].max()
935-
#
936-
c = "Low"
937-
j = data_cols.index(c)
953+
954+
c = "Low" ; j = data_cols.index(c)
938955
if fi[j]:
939956
df2.loc[idx, c] = df2.loc[idx, ["Open", "Close"]].min()
940957

@@ -953,9 +970,9 @@ def _fix_unit_random_mixups(self, df, interval, tz_exchange, prepost):
953970
logger.info('price-repair-100x: ' + report_msg)
954971

955972
# Restore original values where repair failed
956-
f = df2_tagged
973+
f_either = df2[data_cols].to_numpy() == tag
957974
for j in range(len(data_cols)):
958-
fj = f[:, j]
975+
fj = f_either[:, j]
959976
if fj.any():
960977
c = data_cols[j]
961978
df2.loc[fj, c] = df.loc[fj, c]
@@ -977,14 +994,6 @@ def _fix_unit_switch(self, df, interval, tz_exchange):
977994
# This function fixes the second.
978995
# Eventually Yahoo fixes but could take them 2 weeks.
979996

980-
# To detect, use 'bad split adjustment' algorithm. But only correct
981-
# if no stock splits in data
982-
983-
f_splits = df['Stock Splits'].to_numpy() != 0.0
984-
if f_splits.any():
985-
utils.get_yf_logger().debug('price-repair-100x: Cannot check for chunked 100x errors because splits present')
986-
return df
987-
988997
return self._fix_prices_sudden_change(df, interval, tz_exchange, 100.0)
989998

990999
@utils.log_indent_decorator
@@ -993,6 +1002,9 @@ def _fix_zeroes(self, df, interval, tz_exchange, prepost):
9931002
# But most times when prices=0 or NaN returned is because no trades.
9941003
# Impossible to distinguish, so only attempt repair if few or rare.
9951004

1005+
if df.empty:
1006+
return df
1007+
9961008
logger = utils.get_yf_logger()
9971009

9981010
if df.shape[0] == 0:
@@ -1101,6 +1113,9 @@ def _fix_missing_div_adjust(self, df, interval, tz_exchange):
11011113
# Easy to detect and correct BUT ONLY IF the data 'df' includes today's dividend.
11021114
# E.g. if fetching historic prices before todays dividend, then cannot fix.
11031115

1116+
if df.empty:
1117+
return df
1118+
11041119
logger = utils.get_yf_logger()
11051120

11061121
if df is None or df.empty:
@@ -1173,6 +1188,9 @@ def _fix_bad_stock_split(self, df, interval, tz_exchange):
11731188
# which direction to reverse adjustment - have to analyse prices and detect.
11741189
# Not difficult.
11751190

1191+
if df.empty:
1192+
return df
1193+
11761194
logger = utils.get_yf_logger()
11771195

11781196
interday = interval in ['1d', '1wk', '1mo', '3mo']
@@ -1198,6 +1216,9 @@ def _fix_bad_stock_split(self, df, interval, tz_exchange):
11981216

11991217
@utils.log_indent_decorator
12001218
def _fix_prices_sudden_change(self, df, interval, tz_exchange, change, correct_volume=False):
1219+
if df.empty:
1220+
return df
1221+
12011222
logger = utils.get_yf_logger()
12021223

12031224
df = df.sort_index(ascending=False)
@@ -1262,11 +1283,25 @@ def _fix_prices_sudden_change(self, df, interval, tz_exchange, change, correct_v
12621283
# Avoid using 'Low' and 'High'. For multiday intervals, these can be
12631284
# very volatile so reduce ability to detect genuine stock split errors
12641285
_1d_change_x = np.full((n, 2), 1.0)
1265-
price_data = df2[['Open','Close']].replace(0.0, 1.0).to_numpy()
1286+
price_data = df2[['Open','Close']].to_numpy()
1287+
f_zero = price_data == 0.0
12661288
else:
12671289
_1d_change_x = np.full((n, 4), 1.0)
1268-
price_data = df2[OHLC].replace(0.0, 1.0).to_numpy()
1290+
price_data = df2[OHLC].to_numpy()
1291+
f_zero = price_data == 0.0
1292+
if f_zero.any():
1293+
price_data[f_zero] = 1.0
1294+
1295+
# Update: if a VERY large dividend is paid out, then can be mistaken for a 1:2 stock split.
1296+
# Fix = use adjusted prices
1297+
adj = df2['Adj Close'].to_numpy() / df2['Close'].to_numpy()
1298+
for j in range(price_data.shape[1]):
1299+
price_data[:,j] *= adj
1300+
12691301
_1d_change_x[1:] = price_data[1:, ] / price_data[:-1, ]
1302+
f_zero_num_denom = f_zero | np.roll(f_zero, 1, axis=0)
1303+
if f_zero_num_denom.any():
1304+
_1d_change_x[f_zero_num_denom] = 1.0
12701305
if interday and interval != '1d':
12711306
# average change
12721307
_1d_change_minx = np.average(_1d_change_x, axis=1)
@@ -1365,6 +1400,29 @@ def _fix_prices_sudden_change(self, df, interval, tz_exchange, change, correct_v
13651400
logger.info(f'price-repair-split: No {fix_type}s detected')
13661401
return df
13671402

1403+
# Update: if any 100x changes are soon after a stock split, so could be confused with split error, then abort
1404+
threshold_days = 30
1405+
f_splits = df['Stock Splits'].to_numpy() != 0.0
1406+
if change in [100.0, 0.01] and f_splits.any():
1407+
indices_A = np.where(f_splits)[0]
1408+
indices_B = np.where(f)[0]
1409+
if not len(indices_A) or not len(indices_B):
1410+
return None
1411+
gaps = indices_B[:, None] - indices_A
1412+
# Because data is sorted in DEscending order, need to flip gaps
1413+
gaps *= -1
1414+
f_pos = gaps > 0
1415+
if f_pos.any():
1416+
gap_min = gaps[f_pos].min()
1417+
gap_td = utils._interval_to_timedelta(interval) * gap_min
1418+
if isinstance(gap_td, _dateutil.relativedelta.relativedelta):
1419+
threshold = _dateutil.relativedelta.relativedelta(days=threshold_days)
1420+
else:
1421+
threshold = _datetime.timedelta(days=threshold_days)
1422+
if gap_td < threshold:
1423+
logger.info(f'price-repair-split: 100x changes are too soon after stock split events, aborting')
1424+
return df
1425+
13681426
# if logger.isEnabledFor(logging.DEBUG):
13691427
# df_debug['i'] = list(range(0, df_debug.shape[0]))
13701428
# df_debug['i_rev'] = df_debug.shape[0]-1 - df_debug['i']

0 commit comments

Comments (0)