@@ -829,6 +829,8 @@ def _reconstruct_intervals_batch(self, df, interval, prepost, tag=-1):
829829
@utils.log_indent_decorator
def _fix_unit_mixups(self, df, interval, tz_exchange, prepost):
    # Repair 100x price errors ($ <-> cents mixups) in two passes:
    # first any sudden wholesale unit switch, then sporadic per-row mixups.
    # Empty frames have nothing to repair, so short-circuit immediately.
    if df.empty:
        return df
    repaired = self._fix_unit_switch(df, interval, tz_exchange)
    return self._fix_unit_random_mixups(repaired, interval, tz_exchange, prepost)
@@ -842,6 +844,9 @@ def _fix_unit_random_mixups(self, df, interval, tz_exchange, prepost):
842844 # - a sudden switch between $<->cents at some date
843845 # This function fixes the first.
844846
847+ if df .empty :
848+ return df
849+
845850 # Easy to detect and fix, just look for outliers = ~100x local median
846851 logger = utils .get_yf_logger ()
847852
@@ -885,7 +890,11 @@ def _fix_unit_random_mixups(self, df, interval, tz_exchange, prepost):
885890 ratio = df2_data / median
886891 ratio_rounded = (ratio / 20 ).round () * 20 # round ratio to nearest 20
887892 f = ratio_rounded == 100
888- if not f .any ():
893+ ratio_rcp = 1.0 / ratio
894+ ratio_rcp_rounded = (ratio_rcp / 20 ).round () * 20 # round ratio to nearest 20
895+ f_rcp = (ratio_rounded == 100 ) | (ratio_rcp_rounded == 100 )
896+ f_either = f | f_rcp
897+ if not f_either .any ():
889898 logger .info ("price-repair-100x: No sporadic 100x errors" )
890899 if "Repaired?" not in df .columns :
891900 df ["Repaired?" ] = False
@@ -894,7 +903,7 @@ def _fix_unit_random_mixups(self, df, interval, tz_exchange, prepost):
894903 # Mark values to send for repair
895904 tag = - 1.0
896905 for i in range (len (data_cols )):
897- fi = f [:, i ]
906+ fi = f_either [:, i ]
898907 c = data_cols [i ]
899908 df2 .loc [fi , c ] = tag
900909
@@ -906,35 +915,43 @@ def _fix_unit_random_mixups(self, df, interval, tz_exchange, prepost):
906915 if n_after > 0 :
907916 # This second pass will *crudely* "fix" any remaining errors in High/Low
908917 # simply by ensuring they don't contradict e.g. Low = 100x High.
909- f = df2_tagged
918+ f = ( df2 [ data_cols ]. to_numpy () == tag ) & f
910919 for i in range (f .shape [0 ]):
911920 fi = f [i , :]
912921 if not fi .any ():
913922 continue
914923 idx = df2 .index [i ]
915924
916- c = "Open"
917- j = data_cols .index (c )
918- if fi [j ]:
919- df2 .loc [idx , c ] = df .loc [idx , c ] * 0.01
920- #
921- c = "Close"
922- j = data_cols .index (c )
925+ for c in ['Open' , 'Close' ]:
926+ j = data_cols .index (c )
927+ if fi [j ]:
928+ df2 .loc [idx , c ] = df .loc [idx , c ] * 0.01
929+
930+ c = "High" ; j = data_cols .index (c )
923931 if fi [j ]:
924- df2 .loc [idx , c ] = df .loc [idx , c ] * 0.01
925- #
926- c = "Adj Close"
927- j = data_cols .index (c )
932+ df2 .loc [idx , c ] = df2 .loc [idx , ["Open" , "Close" ]].max ()
933+
934+ c = "Low" ; j = data_cols .index (c )
928935 if fi [j ]:
929- df2 .loc [idx , c ] = df .loc [idx , c ] * 0.01
930- #
931- c = "High"
932- j = data_cols .index (c )
936+ df2 .loc [idx , c ] = df2 .loc [idx , ["Open" , "Close" ]].min ()
937+
938+ f_rcp = (df2 [data_cols ].to_numpy () == tag ) & f_rcp
939+ for i in range (f_rcp .shape [0 ]):
940+ fi = f_rcp [i , :]
941+ if not fi .any ():
942+ continue
943+ idx = df2 .index [i ]
944+
945+ for c in ['Open' , 'Close' ]:
946+ j = data_cols .index (c )
947+ if fi [j ]:
948+ df2 .loc [idx , c ] = df .loc [idx , c ] * 100.0
949+
950+ c = "High" ; j = data_cols .index (c )
933951 if fi [j ]:
934952 df2 .loc [idx , c ] = df2 .loc [idx , ["Open" , "Close" ]].max ()
935- #
936- c = "Low"
937- j = data_cols .index (c )
953+
954+ c = "Low" ; j = data_cols .index (c )
938955 if fi [j ]:
939956 df2 .loc [idx , c ] = df2 .loc [idx , ["Open" , "Close" ]].min ()
940957
@@ -953,9 +970,9 @@ def _fix_unit_random_mixups(self, df, interval, tz_exchange, prepost):
953970 logger .info ('price-repair-100x: ' + report_msg )
954971
955972 # Restore original values where repair failed
956- f = df2_tagged
973+ f_either = df2 [ data_cols ]. to_numpy () == tag
957974 for j in range (len (data_cols )):
958- fj = f [:, j ]
975+ fj = f_either [:, j ]
959976 if fj .any ():
960977 c = data_cols [j ]
961978 df2 .loc [fj , c ] = df .loc [fj , c ]
@@ -977,14 +994,6 @@ def _fix_unit_switch(self, df, interval, tz_exchange):
977994 # This function fixes the second.
978995 # Eventually Yahoo fixes but could take them 2 weeks.
979996
980- # To detect, use 'bad split adjustment' algorithm. But only correct
981- # if no stock splits in data
982-
983- f_splits = df ['Stock Splits' ].to_numpy () != 0.0
984- if f_splits .any ():
985- utils .get_yf_logger ().debug ('price-repair-100x: Cannot check for chunked 100x errors because splits present' )
986- return df
987-
988997 return self ._fix_prices_sudden_change (df , interval , tz_exchange , 100.0 )
989998
990999 @utils .log_indent_decorator
@@ -993,6 +1002,9 @@ def _fix_zeroes(self, df, interval, tz_exchange, prepost):
9931002 # But most times when prices=0 or NaN returned is because no trades.
9941003 # Impossible to distinguish, so only attempt repair if few or rare.
9951004
1005+ if df .empty :
1006+ return df
1007+
9961008 logger = utils .get_yf_logger ()
9971009
9981010 if df .shape [0 ] == 0 :
@@ -1101,6 +1113,9 @@ def _fix_missing_div_adjust(self, df, interval, tz_exchange):
11011113 # Easy to detect and correct BUT ONLY IF the data 'df' includes today's dividend.
11021114 # E.g. if fetching historic prices before todays dividend, then cannot fix.
11031115
1116+ if df .empty :
1117+ return df
1118+
11041119 logger = utils .get_yf_logger ()
11051120
11061121 if df is None or df .empty :
@@ -1173,6 +1188,9 @@ def _fix_bad_stock_split(self, df, interval, tz_exchange):
11731188 # which direction to reverse adjustment - have to analyse prices and detect.
11741189 # Not difficult.
11751190
1191+ if df .empty :
1192+ return df
1193+
11761194 logger = utils .get_yf_logger ()
11771195
11781196 interday = interval in ['1d' , '1wk' , '1mo' , '3mo' ]
@@ -1198,6 +1216,9 @@ def _fix_bad_stock_split(self, df, interval, tz_exchange):
11981216
11991217 @utils .log_indent_decorator
12001218 def _fix_prices_sudden_change (self , df , interval , tz_exchange , change , correct_volume = False ):
1219+ if df .empty :
1220+ return df
1221+
12011222 logger = utils .get_yf_logger ()
12021223
12031224 df = df .sort_index (ascending = False )
@@ -1262,11 +1283,25 @@ def _fix_prices_sudden_change(self, df, interval, tz_exchange, change, correct_v
12621283 # Avoid using 'Low' and 'High'. For multiday intervals, these can be
12631284 # very volatile so reduce ability to detect genuine stock split errors
12641285 _1d_change_x = np .full ((n , 2 ), 1.0 )
1265- price_data = df2 [['Open' ,'Close' ]].replace (0.0 , 1.0 ).to_numpy ()
1286+ price_data = df2 [['Open' ,'Close' ]].to_numpy ()
1287+ f_zero = price_data == 0.0
12661288 else :
12671289 _1d_change_x = np .full ((n , 4 ), 1.0 )
1268- price_data = df2 [OHLC ].replace (0.0 , 1.0 ).to_numpy ()
1290+ price_data = df2 [OHLC ].to_numpy ()
1291+ f_zero = price_data == 0.0
1292+ if f_zero .any ():
1293+ price_data [f_zero ] = 1.0
1294+
1295+ # Update: if a VERY large dividend is paid out, then can be mistaken for a 1:2 stock split.
1296+ # Fix = use adjusted prices
1297+ adj = df2 ['Adj Close' ].to_numpy () / df2 ['Close' ].to_numpy ()
1298+ for j in range (price_data .shape [1 ]):
1299+ price_data [:,j ] *= adj
1300+
12691301 _1d_change_x [1 :] = price_data [1 :, ] / price_data [:- 1 , ]
1302+ f_zero_num_denom = f_zero | np .roll (f_zero , 1 , axis = 0 )
1303+ if f_zero_num_denom .any ():
1304+ _1d_change_x [f_zero_num_denom ] = 1.0
12701305 if interday and interval != '1d' :
12711306 # average change
12721307 _1d_change_minx = np .average (_1d_change_x , axis = 1 )
@@ -1365,6 +1400,29 @@ def _fix_prices_sudden_change(self, df, interval, tz_exchange, change, correct_v
13651400 logger .info (f'price-repair-split: No { fix_type } s detected' )
13661401 return df
13671402
1403+ # Update: if any 100x changes are soon after a stock split, so could be confused with split error, then abort
1404+ threshold_days = 30
1405+ f_splits = df ['Stock Splits' ].to_numpy () != 0.0
1406+ if change in [100.0 , 0.01 ] and f_splits .any ():
1407+ indices_A = np .where (f_splits )[0 ]
1408+ indices_B = np .where (f )[0 ]
1409+ if not len (indices_A ) or not len (indices_B ):
1410+ return None
1411+ gaps = indices_B [:, None ] - indices_A
1412+ # Because data is sorted in DEscending order, need to flip gaps
1413+ gaps *= - 1
1414+ f_pos = gaps > 0
1415+ if f_pos .any ():
1416+ gap_min = gaps [f_pos ].min ()
1417+ gap_td = utils ._interval_to_timedelta (interval ) * gap_min
1418+ if isinstance (gap_td , _dateutil .relativedelta .relativedelta ):
1419+ threshold = _dateutil .relativedelta .relativedelta (days = threshold_days )
1420+ else :
1421+ threshold = _datetime .timedelta (days = threshold_days )
1422+ if gap_td < threshold :
1423+ logger .info (f'price-repair-split: 100x changes are too soon after stock split events, aborting' )
1424+ return df
1425+
13681426 # if logger.isEnabledFor(logging.DEBUG):
13691427 # df_debug['i'] = list(range(0, df_debug.shape[0]))
13701428 # df_debug['i_rev'] = df_debug.shape[0]-1 - df_debug['i']
0 commit comments