@@ -85,9 +85,9 @@ def cdsxmatch(
8585 Return Pandas DataFrame with a new single column
8686 containing comma-separated values of extra-columns.
8787 If the object is not found in Simbad, the type is
88- marked as Unknown . In the case several objects match
88+ marked as null . In the case several objects match
8989 the centroid of the alert, only the closest is returned.
90- If the request Failed (no match at all), return Column of Fail .
90+ If the request Failed (no match at all), return Column of Fails .
9191
9292 Examples
9393 --------
@@ -110,16 +110,16 @@ def cdsxmatch(
110110
111111 Test the processor by adding a new column with the result of the xmatch
112112 >>> df = df.withColumn(
113- ... 'cdsxmatch ',
113+ ... 'simbad_otype ',
114114 ... cdsxmatch(
115115 ... df['id'], df['ra'], df['dec'],
116- ... F.lit(1.0), F.lit('simbad'), F.lit('main_type ')))
116+ ... F.lit(1.0), F.lit('simbad'), F.lit('otype ')))
117117 >>> df.show() # doctest: +NORMALIZE_WHITESPACE
118118 +---+----------+-----------+------------+
119- | id| ra| dec| cdsxmatch |
119+ | id| ra| dec|simbad_otype |
120120 +---+----------+-----------+------------+
121- | a|26.8566983|-26.9677112|LongPeriodV *|
122- | b| 26.24497|-26.7569436| Star |
121+ | a|26.8566983|-26.9677112| LP *|
122+ | b| 26.24497|-26.7569436| * |
123123 +---+----------+-----------+------------+
124124 <BLANKLINE>
125125 """
@@ -149,15 +149,18 @@ def cdsxmatch(
149149 files = {"cat1" : table },
150150 )
151151
152+ col_list = cols .to_numpy ()[0 ].split ("," )
153+
152154 if r .status_code != 200 :
153- names = ["Fail {}" .format (r .status_code )] * len (diaSourceId )
155+ msg = "Fail {}" .format (r .status_code )
156+ names = ["," .join ([msg ] * len (col_list ))] * len (diaSourceId )
154157 return pd .Series (names )
155158 else :
156- cols = cols .to_numpy ()[0 ].split ("," )
157159 pdf = pd .read_csv (io .BytesIO (r .content ))
158160
159161 if pdf .empty :
160- name = "," .join (["Unknown" ] * len (cols ))
162+ # null values
163+ name = "," .join ([None ] * len (col_list ))
161164 names = [name ] * len (diaSourceId )
162165 return pd .Series (names )
163166
@@ -171,27 +174,22 @@ def cdsxmatch(
171174
172175 pdf_out = pdf_in .join (pdf_nodedup )
173176
174- # only for SIMBAD as we use `main_type` for our classification
175- if "main_type" in pdf_out .columns :
176- pdf_out ["main_type" ] = pdf_out ["main_type" ].replace (np .nan , "Unknown" )
177-
178- if len (cols ) > 1 :
177+ if len (col_list ) > 1 :
179178 # Concatenate all columns in one
180179 # use comma-separated values
181- cols = [i .strip () for i in cols ]
182- pdf_out = pdf_out [cols ]
180+ col_list = [i .strip () for i in col_list ]
181+ pdf_out = pdf_out [col_list ]
183182 pdf_out ["concat_cols" ] = pdf_out .apply (
184183 lambda x : "," .join (x .astype (str ).to_numpy ().tolist ()), axis = 1
185184 )
186185 return pdf_out ["concat_cols" ]
187- elif len (cols ) == 1 :
186+ elif len (col_list ) == 1 :
188187 # single column to return
189- return pdf_out [cols [0 ]].astype (str )
188+ return pdf_out [col_list [0 ]].astype (str )
190189
191190 except (ConnectionError , TimeoutError , ValueError ) as ce :
192191 logging .warning ("XMATCH failed " + repr (ce ))
193- ncols = len (cols .to_numpy ()[0 ].split ("," ))
194- name = "," .join (["Fail" ] * ncols )
192+ name = "," .join (["Fail" ] * len (col_list ))
195193 names = [name ] * len (diaSourceId )
196194 return pd .Series (names )
197195
@@ -206,6 +204,11 @@ def xmatch_cds(
206204):
207205 """Cross-match Fink data from a Spark DataFrame with a catalog in CDS
208206
207+ Notes
208+ -----
209+ To check available columns and their name:
210+ http://cdsxmatch.u-strasbg.fr/xmatch/api/v1/sync/tables?action=getColList&tabName=I/355/gaiadr3&RESPONSEFORMAT=json
211+
209212 Parameters
210213 ----------
211214 df: Spark DataFrame
@@ -220,9 +223,11 @@ def xmatch_cds(
220223 Default is ["diaSource.diaSourceId", "diaSource.ra", "diaSource.dec"]
221224 cols_out: list of str
222225 N column names to get from the external catalog.
226+ If None, assume ["otype"] for simbad.
223227 types: list of str
224228 N types of columns from the external catalog.
225- Should be SQL syntax (str=string, etc.)
229+ Should be SQL syntax (str=string, etc.).
230+ If None, return ["str"] for simbad.
226231
227232 Returns
228233 -------
@@ -235,7 +240,7 @@ def xmatch_cds(
235240
236241 # Simbad
237242 >>> df_simbad = xmatch_cds(df)
238- >>> 'cdsxmatch ' in df_simbad.columns
243+ >>> 'simbad_otype ' in df_simbad.columns
239244 True
240245
241246 # Gaia
@@ -245,7 +250,7 @@ def xmatch_cds(
245250 ... catalogname='vizier:I/355/gaiadr3',
246251 ... cols_out=['DR3Name', 'Plx', 'e_Plx'],
247252 ... types=['string', 'float', 'float'])
248- >>> 'Plx ' in df_gaia.columns
253+ >>> 'vizier:I/355/gaiadr3_Plx ' in df_gaia.columns
249254 True
250255
251256 # VSX
@@ -255,7 +260,7 @@ def xmatch_cds(
255260 ... distmaxarcsec=1.5,
256261 ... cols_out=['Type'],
257262 ... types=['string'])
258- >>> 'Type ' in df_vsx.columns
263+ >>> 'vizier:B/vsx/vsx_Type ' in df_vsx.columns
259264 True
260265
261266 # SPICY
@@ -265,13 +270,13 @@ def xmatch_cds(
265270 ... distmaxarcsec=1.2,
266271 ... cols_out=['SPICY', 'class'],
267272 ... types=['int', 'string'])
268- >>> 'SPICY ' in df_spicy.columns
273+ >>> 'vizier:J/ApJS/254/33/table1_SPICY ' in df_spicy.columns
269274 True
270275 """
271276 if cols_in is None :
272277 cols_in = ["diaSource.diaSourceId" , "diaSource.ra" , "diaSource.dec" ]
273278 if cols_out is None :
274- cols_out = ["main_type " ]
279+ cols_out = ["otype " ]
275280 if types is None :
276281 types = ["string" ]
277282
@@ -285,20 +290,15 @@ def xmatch_cds(
285290 F .lit (catalogname ),
286291 F .lit ("," .join (cols_out )),
287292 ),
288- ). withColumn ( "xmatch_split" , F . split ( "xmatch" , "," ))
293+ )
289294
290295 for index , col_ , type_ in zip (range (len (cols_out )), cols_out , types ):
291296 df_out = df_out .withColumn (
292- col_ , F .col ("xmatch_split" ).getItem (index ).astype (type_ )
297+ "{}_{}" .format (catalogname , col_ ),
298+ F .split ("xmatch" , "," ).getItem (index ).astype (type_ ),
293299 )
294300
295- df_out = df_out .drop ("xmatch" , "xmatch_split" )
296-
297- # Keep compatibility with previous definitions
298- if "main_type" in df_out .columns :
299- # remove previous declaration if any
300- df_out = df_out .drop ("cdsxmatch" )
301- df_out = df_out .withColumnRenamed ("main_type" , "cdsxmatch" )
301+ df_out = df_out .drop ("xmatch" )
302302
303303 return df_out
304304
@@ -315,7 +315,7 @@ def xmatch_tns(df, distmaxarcsec=1.5, tns_raw_output=""):
315315 tns_raw_output: str, optional
316316 Folder that contains raw TNS catalog. Inside, it is expected
317317 to find the file `tns_raw.parquet` downloaded using
318- `fink-broker/bin/download_tns.py`. Default is "" , in
318+ `fink-broker/bin/download_tns.py`. Default is None , in
319319 which case the catalog will be downloaded. Beware that
320320 to download the catalog, you need to set environment variables:
321321 - TNS_API_MARKER: path to the TNS API marker (tns_marker.txt)
@@ -333,11 +333,11 @@ def xmatch_tns(df, distmaxarcsec=1.5, tns_raw_output=""):
333333 >>> curdir = os.path.dirname(os.path.abspath(__file__))
334334 >>> path = curdir + '/data/catalogs'
335335 >>> df_tns = xmatch_tns(df, tns_raw_output=path)
336- >>> 'tns ' in df_tns.columns
336+ >>> 'tns_type ' in df_tns.columns
337337 True
338338
339- >>> df_tns.filter(df_tns["tns"] != "" ).count()
340- 0
339+ >>> df_tns.filter(df_tns["tns_type"].isNull() ).count()
340+ 50
341341
342342 """
343343 if tns_raw_output == "" :
@@ -351,8 +351,10 @@ def xmatch_tns(df, distmaxarcsec=1.5, tns_raw_output=""):
351351 _LOG .warning (
352352 "TNS_API_MARKER and TNS_API_KEY are not defined as env var in the master."
353353 )
354- _LOG .warning ("Skipping crossmatch with TNS." )
355- df = df .withColumn ("tns" , F .lit ("" ))
354+ _LOG .warning (
355+ "Skipping crossmatch with TNS. Creating a tns_type columns with null values."
356+ )
357+ df = df .withColumn ("tns_type" , F .lit (None ))
356358 return df
357359 else :
358360 pdf_tns = pd .read_parquet (os .path .join (tns_raw_output , "tns_raw.parquet" ))
@@ -380,7 +382,7 @@ def crossmatch_with_tns(diaSourceId, ra, dec):
380382 Returns
381383 -------
382384 to_return: pd.Series of str
383- TNS type for the alert. `Unknown` if no match.
385+ TNS type for the alert. null if no match.
384386 """
385387 pdf = pdf_tns_filt_b .value
386388 ra2 , dec2 , type2 = pdf ["ra" ], pdf ["declination" ], pdf ["type" ]
@@ -410,21 +412,21 @@ def crossmatch_with_tns(diaSourceId, ra, dec):
410412 # set separation length
411413 sep_constraint2 = d2d2 .degree < distmaxarcsec / 3600.0
412414
413- sub_pdf ["TNS" ] = ["" ] * len (sub_pdf )
415+ sub_pdf ["TNS" ] = [None ] * len (sub_pdf )
414416 sub_pdf ["TNS" ][sep_constraint2 ] = type2 .to_numpy ()[idx2 [sep_constraint2 ]]
415417
416418 # Here we take the first match
417419 # What if there are many? AT & SN?
418420 to_return = diaSourceId .apply (
419- lambda x : ""
421+ lambda x : None
420422 if x not in sub_pdf ["diaSourceId" ].to_numpy ()
421423 else sub_pdf ["TNS" ][sub_pdf ["diaSourceId" ] == x ].to_numpy ()[0 ]
422424 )
423425
424426 return to_return
425427
426428 df = df .withColumn (
427- "tns " ,
429+ "tns_type " ,
428430 crossmatch_with_tns (
429431 df ["diaSource.diaSourceId" ], df ["diaSource.ra" ], df ["diaSource.dec" ]
430432 ),
@@ -460,7 +462,7 @@ def crossmatch_other_catalog(diaSourceId, ra, dec, catalog_name, radius_arcsec=N
460462 Returns
461463 -------
462464 type: str
463- Object type from the catalog. `Unknown` if no match.
465+ Object type from the catalog. null if no match.
464466
465467 Examples
466468 --------
@@ -490,28 +492,28 @@ def crossmatch_other_catalog(diaSourceId, ra, dec, catalog_name, radius_arcsec=N
490492 ... 'gcvs',
491493 ... crossmatch_other_catalog(df['id'], df['ra'], df['dec'], lit('gcvs'))
492494 ... ).show() # doctest: +NORMALIZE_WHITESPACE
493- +---+-----------+-----------+------- +
494- | id| ra| dec| gcvs|
495- +---+-----------+-----------+------- +
496- | 1| 26.8566983|-26.9677112|Unknown |
497- | 2|101.3520545| 24.5421872| RR|
498- | 3| 0.3126| 47.6859|Unknown |
499- | 4| 0.31820833|29.59277778|Unknown |
500- +---+-----------+-----------+------- +
495+ +---+-----------+-----------+----+
496+ | id| ra| dec|gcvs|
497+ +---+-----------+-----------+----+
498+ | 1| 26.8566983|-26.9677112|null |
499+ | 2|101.3520545| 24.5421872| RR|
500+ | 3| 0.3126| 47.6859|null |
501+ | 4| 0.31820833|29.59277778|null |
502+ +---+-----------+-----------+----+
501503 <BLANKLINE>
502504
503505 >>> df.withColumn(
504506 ... 'vsx',
505507 ... crossmatch_other_catalog(df['id'], df['ra'], df['dec'], lit('vsx'))
506508 ... ).show() # doctest: +NORMALIZE_WHITESPACE
507- +---+-----------+-----------+------- +
508- | id| ra| dec| vsx|
509- +---+-----------+-----------+------- +
510- | 1| 26.8566983|-26.9677112| MISC|
511- | 2|101.3520545| 24.5421872| RRAB|
512- | 3| 0.3126| 47.6859|Unknown |
513- | 4| 0.31820833|29.59277778|Unknown |
514- +---+-----------+-----------+------- +
509+ +---+-----------+-----------+----+
510+ | id| ra| dec| vsx|
511+ +---+-----------+-----------+----+
512+ | 1| 26.8566983|-26.9677112|MISC|
513+ | 2|101.3520545| 24.5421872|RRAB|
514+ | 3| 0.3126| 47.6859|null |
515+ | 4| 0.31820833|29.59277778|null |
516+ +---+-----------+-----------+----+
515517 <BLANKLINE>
516518
517519 >>> df.withColumn(
@@ -521,9 +523,9 @@ def crossmatch_other_catalog(diaSourceId, ra, dec, catalog_name, radius_arcsec=N
521523 +---+-----------+-----------+--------------------+
522524 | id| ra| dec| 3hsp|
523525 +---+-----------+-----------+--------------------+
524- | 1| 26.8566983|-26.9677112| Unknown |
525- | 2|101.3520545| 24.5421872| Unknown |
526- | 3| 0.3126| 47.6859| Unknown |
526+ | 1| 26.8566983|-26.9677112| null |
527+ | 2|101.3520545| 24.5421872| null |
528+ | 3| 0.3126| 47.6859| null |
527529 | 4| 0.31820833|29.59277778|3HSPJ000116.4+293534|
528530 +---+-----------+-----------+--------------------+
529531 <BLANKLINE>
@@ -535,10 +537,10 @@ def crossmatch_other_catalog(diaSourceId, ra, dec, catalog_name, radius_arcsec=N
535537 +---+-----------+-----------+-----------------+
536538 | id| ra| dec| 4lac|
537539 +---+-----------+-----------+-----------------+
538- | 1| 26.8566983|-26.9677112| Unknown |
539- | 2|101.3520545| 24.5421872| Unknown |
540+ | 1| 26.8566983|-26.9677112| null |
541+ | 2|101.3520545| 24.5421872| null |
540542 | 3| 0.3126| 47.6859|4FGL J0001.2+4741|
541- | 4| 0.31820833|29.59277778| Unknown |
543+ | 4| 0.31820833|29.59277778| null |
542544 +---+-----------+-----------+-----------------+
543545 <BLANKLINE>
544546 """
@@ -578,7 +580,7 @@ def crossmatch_other_catalog(diaSourceId, ra, dec, catalog_name, radius_arcsec=N
578580 pdf , catalog_rubin , catalog_other , radius_arcsec = radius_arcsec
579581 )
580582
581- pdf_merge ["Type" ] = "Unknown"
583+ pdf_merge ["Type" ] = None
582584 pdf_merge .loc [mask , "Type" ] = [
583585 str (i ).strip () for i in type2 .astype (str ).to_numpy ()[idx2 ]
584586 ]
@@ -605,7 +607,7 @@ def crossmatch_mangrove(diaSourceId, ra, dec, radius_arcsec=None):
605607 Returns
606608 -------
607609 type: str
608- Object type from the catalog. `Unknown` if no match.
610+ Object type from the catalog. null if no match.
609611
610612 Examples
611613 --------
@@ -637,9 +639,9 @@ def crossmatch_mangrove(diaSourceId, ra, dec, radius_arcsec=None):
637639 ... ).toPandas() # doctest: +NORMALIZE_WHITESPACE
638640 id ra dec mangrove
639641 0 1 198.955536 42.029289 {'HyperLEDA_name': 'NGC5055', '2MASS_name': '1...
640- 1 2 101.352054 24.542187 {'HyperLEDA_name': ' None' , '2MASS_name': ' None...
641- 2 3 0.312600 47.685900 {'HyperLEDA_name': ' None' , '2MASS_name': ' None...
642- 3 4 0.318208 29.592778 {'HyperLEDA_name': ' None' , '2MASS_name': ' None...
642+ 1 2 101.352054 24.542187 {'HyperLEDA_name': None, '2MASS_name': None, ' ...
643+ 2 3 0.312600 47.685900 {'HyperLEDA_name': None, '2MASS_name': None, ' ...
644+ 3 4 0.318208 29.592778 {'HyperLEDA_name': None, '2MASS_name': None, ' ...
643645 """
644646 pdf = pd .DataFrame ({
645647 "ra" : ra .to_numpy (),
@@ -666,7 +668,7 @@ def crossmatch_mangrove(diaSourceId, ra, dec, radius_arcsec=None):
666668 pdf , catalog_rubin , catalog_other , radius_arcsec = radius_arcsec
667669 )
668670
669- default = {name : " None" for name in MANGROVE_COLS }
671+ default = {name : None for name in MANGROVE_COLS }
670672 pdf_merge ["Type" ] = [default for i in range (len (pdf_merge ))]
671673 pdf_merge .loc [mask , "Type" ] = [payload [i ] for i in idx2 ]
672674
0 commit comments