Skip to content

Commit 2167fb3

Browse files
authored
Merge pull request #2139 from ranaroussi/fix/price-repair-missing-some
Prices dividend repair: fixes
2 parents d3ec9c6 + 006649d commit 2167fb3

File tree

9 files changed

+3853
-3443
lines changed

9 files changed

+3853
-3443
lines changed

tests/data/KWS-L-1d-bad-div-fixed.csv

Lines changed: 724 additions & 665 deletions
Large diffs are not rendered by default.

tests/data/KWS-L-1d-bad-div.csv

Lines changed: 724 additions & 665 deletions
Large diffs are not rendered by default.

tests/data/NVT-L-1d-bad-div-fixed.csv

Lines changed: 428 additions & 388 deletions
Large diffs are not rendered by default.

tests/data/NVT-L-1d-bad-div.csv

Lines changed: 293 additions & 253 deletions
Large diffs are not rendered by default.

tests/data/SCR-TO-1d-bad-div-fixed.csv

Lines changed: 460 additions & 426 deletions
Large diffs are not rendered by default.

tests/data/SCR-TO-1d-bad-div.csv

Lines changed: 257 additions & 223 deletions
Large diffs are not rendered by default.

tests/data/SOLB-BR-1d-bad-div-fixed.csv

Lines changed: 586 additions & 523 deletions
Large diffs are not rendered by default.

tests/data/SOLB-BR-1d-bad-div.csv

Lines changed: 339 additions & 276 deletions
Large diffs are not rendered by default.

yfinance/scrapers/history.py

Lines changed: 42 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1426,7 +1426,7 @@ def _fix_bad_div_adjust(self, df, interval, currency):
14261426
typical_volatility = np.nan
14271427
else:
14281428
diffs = df2['Close'].iloc[start:end-1].to_numpy() - df2['Low'].iloc[start+1:end].to_numpy()
1429-
typical_volatility = np.median(np.abs(diffs))
1429+
typical_volatility = np.mean(np.abs(diffs))
14301430

14311431
possibilities = []
14321432
if (drops==0.0).all() and df2['Volume'].iloc[div_idx]==0:
@@ -1681,10 +1681,6 @@ def cluster_dividends(df, column='div', threshold=7):
16811681
div_status_df.loc[phantom_div_dt, c] = False
16821682
checks.append('phantom')
16831683

1684-
if not div_status_df[checks].any().any():
1685-
# Perfect
1686-
return df
1687-
16881684
# Remove phantoms early
16891685
if 'phantom' in div_status_df.columns:
16901686
f_phantom = div_status_df['phantom']
@@ -1709,6 +1705,29 @@ def cluster_dividends(df, column='div', threshold=7):
17091705
if 'phantom' in checks:
17101706
checks.remove('phantom')
17111707

1708+
if not div_status_df[checks].any().any():
1709+
# Maybe failed to detect a too-small div. If div is ~0.01x of previous and next, then
1710+
# treat as a 0.01x error
1711+
if len(div_status_df) > 1:
1712+
for i in range(0, len(div_status_df)):
1713+
r_pre, r_post = None, None
1714+
if i > 0:
1715+
r_pre = div_status_df['%'].iloc[i-1] / div_status_df['%'].iloc[i]
1716+
if i < (len(div_status_df)-1):
1717+
r_post = div_status_df['%'].iloc[i+1] / div_status_df['%'].iloc[i]
1718+
r_pre = r_pre or r_post
1719+
r_post = r_post or r_pre
1720+
if abs(r_pre-currency_divide)<20 and abs(r_post-currency_divide)<20:
1721+
div_dt = div_status_df.index[i]
1722+
div_status_df.loc[div_dt, 'div_too_small'] = True
1723+
1724+
if not div_status_df[checks].any().any():
1725+
# Perfect
1726+
if df_modified:
1727+
return df2
1728+
else:
1729+
return df
1730+
17121731
# Check if the present div-adjustment contradicts price action
17131732
for i in range(len(div_status_df)):
17141733
div_idx = div_status_df['idx'].iloc[i]
@@ -1789,7 +1808,8 @@ def cluster_dividends(df, column='div', threshold=7):
17891808
elif adjDelta_drop > 0.39*adjDiv:
17901809
# Still true that applied adjustment exceeds price action,
17911810
# just not clear what solution is (if any).
1792-
div_adj_exceeds_prices = True
1811+
if (x['Adj']<1.0).any():
1812+
div_adj_exceeds_prices = True
17931813
break
17941814

17951815
# Can prune the space:
@@ -1843,22 +1863,6 @@ def cluster_dividends(df, column='div', threshold=7):
18431863

18441864
checks += ['adj_exceeds_prices', 'div_date_wrong']
18451865

1846-
if not div_status_df[checks].any().any():
1847-
# Maybe failed to detect a too-small div. If div is ~0.01x of previous and next, then
1848-
# treat as a 0.01x error
1849-
if len(div_status_df) > 1:
1850-
for i in range(0, len(div_status_df)):
1851-
r_pre, r_post = None, None
1852-
if i > 0:
1853-
r_pre = div_status_df['%'].iloc[i-1] / div_status_df['%'].iloc[i]
1854-
if i < (len(div_status_df)-1):
1855-
r_post = div_status_df['%'].iloc[i+1] / div_status_df['%'].iloc[i]
1856-
r_pre = r_pre or r_post
1857-
r_post = r_post or r_pre
1858-
if abs(r_pre-currency_divide)<20 and abs(r_post-currency_divide)<20:
1859-
div_dt = div_status_df.index[i]
1860-
div_status_df.loc[div_dt, 'div_too_small'] = True
1861-
18621866
for c in checks:
18631867
if not div_status_df[c].any():
18641868
div_status_df = div_status_df.drop(c, axis=1)
@@ -1887,11 +1891,16 @@ def cluster_dividends(df, column='div', threshold=7):
18871891
div_pcts['avg yr yield'] = div_pcts['%'] / div_pcts['period']
18881892

18891893
for c in checks:
1894+
if not cluster[c].to_numpy().any():
1895+
cluster = cluster.drop(c, axis=1)
1896+
cluster_checks = [c for c in checks if c in cluster.columns]
1897+
1898+
for c in cluster_checks:
18901899
f_fail = cluster[c].to_numpy()
18911900
n_fail = np.sum(f_fail)
18921901
if n_fail in [0, n]:
18931902
continue
1894-
pct_fail = np.sum(f_fail) / n
1903+
pct_fail = n_fail / n
18951904
if c == 'div_too_big':
18961905
true_threshold = 1.0
18971906
fals_threshold = 0.2
@@ -1900,7 +1909,16 @@ def cluster_dividends(df, column='div', threshold=7):
19001909
continue
19011910

19021911
if 'adj_exceeds_prices' in cluster.columns and (cluster[c] == (cluster[c] & cluster['adj_exceeds_prices'])).all():
1903-
# More likely that true-positive. Maybe the div never happened
1912+
# Treat div_too_big=False as false positives IFF adj_exceeds_prices=true AND
1913+
# the ratio of True values is above the (lowered) threshold.
1914+
true_threshold = 0.5
1915+
f_adj_exceeds_prices = cluster['adj_exceeds_prices'].to_numpy()
1916+
n = np.sum(f_adj_exceeds_prices)
1917+
n_fail = np.sum(f_fail[f_adj_exceeds_prices])
1918+
pct_fail = n_fail / n
1919+
if pct_fail > true_threshold:
1920+
f = fc & div_status_df['adj_exceeds_prices'].to_numpy()
1921+
div_status_df.loc[f, c] = True
19041922
continue
19051923

19061924
if 'div_exceeds_adj' in cluster.columns and cluster['div_exceeds_adj'].all():

0 commit comments

Comments
 (0)