
Commit 9a02e3a

Author: Etienne Russeil
Merge commit: 2 parents 0585b0d + ab8faad

5 files changed: +141 -117 lines changed


.github/workflows/linter.yml

Lines changed: 2 additions & 2 deletions
@@ -30,5 +30,5 @@ jobs:
           ruff format --preview --check fink_science/*.py
       - name: Science modules
         run: |
-          ruff check --preview --statistics fink_science/*/*.py
-          ruff format --preview --check fink_science/*/*.py
+          ruff check --preview --statistics fink_science/*/*/*.py
+          ruff format --preview --check fink_science/*/*/*.py
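Note on the pattern change above: the science modules now sit one directory deeper under fink_science/ (for example fink_science/rubin/xmatch/processor.py, modified below), so the ruff globs need an extra */ level. A minimal sketch of the difference using Python's glob module; any matched paths other than processor.py are assumptions for illustration:

    # Sketch: the old pattern stops at fink_science/<module>/<file>.py,
    # the new one reaches fink_science/<survey>/<module>/<file>.py,
    # e.g. fink_science/rubin/xmatch/processor.py touched in this commit.
    import glob

    old_matches = set(glob.glob("fink_science/*/*.py"))
    new_matches = set(glob.glob("fink_science/*/*/*.py"))

    # Files picked up only by the deeper pattern
    print(sorted(new_matches - old_matches))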

fink_science/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -12,4 +12,4 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-__version__ = "8.16.0"
+__version__ = "8.17.0"

fink_science/rubin/xmatch/processor.py

Lines changed: 77 additions & 75 deletions
@@ -85,9 +85,9 @@ def cdsxmatch(
     Return Pandas DataFrame with a new single column
     containing comma-separated values of extra-columns.
     If the object is not found in Simbad, the type is
-    marked as Unknown. In the case several objects match
+    marked as null. In the case several objects match
     the centroid of the alert, only the closest is returned.
-    If the request Failed (no match at all), return Column of Fail.
+    If the request Failed (no match at all), return Column of Fails.

     Examples
     --------
@@ -110,16 +110,16 @@ def cdsxmatch(

     Test the processor by adding a new column with the result of the xmatch
     >>> df = df.withColumn(
-    ...     'cdsxmatch',
+    ...     'simbad_otype',
     ...     cdsxmatch(
     ...         df['id'], df['ra'], df['dec'],
-    ...         F.lit(1.0), F.lit('simbad'), F.lit('main_type')))
+    ...         F.lit(1.0), F.lit('simbad'), F.lit('otype')))
     >>> df.show() # doctest: +NORMALIZE_WHITESPACE
     +---+----------+-----------+------------+
-    | id|        ra|        dec|   cdsxmatch|
+    | id|        ra|        dec|simbad_otype|
     +---+----------+-----------+------------+
-    |  a|26.8566983|-26.9677112|LongPeriodV*|
-    |  b|  26.24497|-26.7569436|        Star|
+    |  a|26.8566983|-26.9677112|         LP*|
+    |  b|  26.24497|-26.7569436|           *|
     +---+----------+-----------+------------+
     <BLANKLINE>
     """
@@ -149,15 +149,18 @@ def cdsxmatch(
             files={"cat1": table},
         )

+        col_list = cols.to_numpy()[0].split(",")
+
         if r.status_code != 200:
-            names = ["Fail {}".format(r.status_code)] * len(diaSourceId)
+            msg = "Fail {}".format(r.status_code)
+            names = [",".join([msg] * len(col_list))] * len(diaSourceId)
             return pd.Series(names)
         else:
-            cols = cols.to_numpy()[0].split(",")
             pdf = pd.read_csv(io.BytesIO(r.content))

             if pdf.empty:
-                name = ",".join(["Unknown"] * len(cols))
+                # null values
+                name = ",".join([None] * len(col_list))
                 names = [name] * len(diaSourceId)
                 return pd.Series(names)

@@ -171,27 +174,22 @@ def cdsxmatch(

             pdf_out = pdf_in.join(pdf_nodedup)

-            # only for SIMBAD as we use `main_type` for our classification
-            if "main_type" in pdf_out.columns:
-                pdf_out["main_type"] = pdf_out["main_type"].replace(np.nan, "Unknown")
-
-            if len(cols) > 1:
+            if len(col_list) > 1:
                 # Concatenate all columns in one
                 # use comma-separated values
-                cols = [i.strip() for i in cols]
-                pdf_out = pdf_out[cols]
+                col_list = [i.strip() for i in col_list]
+                pdf_out = pdf_out[col_list]
                 pdf_out["concat_cols"] = pdf_out.apply(
                     lambda x: ",".join(x.astype(str).to_numpy().tolist()), axis=1
                 )
                 return pdf_out["concat_cols"]
-            elif len(cols) == 1:
+            elif len(col_list) == 1:
                 # single column to return
-                return pdf_out[cols[0]].astype(str)
+                return pdf_out[col_list[0]].astype(str)

     except (ConnectionError, TimeoutError, ValueError) as ce:
         logging.warning("XMATCH failed " + repr(ce))
-        ncols = len(cols.to_numpy()[0].split(","))
-        name = ",".join(["Fail"] * ncols)
+        name = ",".join(["Fail"] * len(col_list))
         names = [name] * len(diaSourceId)
         return pd.Series(names)

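For the hunks above: cdsxmatch now splits the requested column list once (col_list) and builds its failure marker per requested column, so a failed request returns one comma-separated value per column instead of a single string. A minimal sketch of that failure path, with the status code and column names made up for illustration:

    # Toy values for illustration only; the real col_list comes from the `cols` argument.
    col_list = ["otype", "ra", "dec"]
    status_code = 500

    msg = "Fail {}".format(status_code)
    row = ",".join([msg] * len(col_list))
    print(row)  # Fail 500,Fail 500,Fail 500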
@@ -206,6 +204,11 @@ def xmatch_cds(
 ):
     """Cross-match Fink data from a Spark DataFrame with a catalog in CDS

+    Notes
+    -----
+    To check available columns and their name:
+    http://cdsxmatch.u-strasbg.fr/xmatch/api/v1/sync/tables?action=getColList&tabName=I/355/gaiadr3&RESPONSEFORMAT=json
+
     Parameters
     ----------
     df: Spark DataFrame
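The Notes added above point to the CDS xmatch table-description endpoint. As a hedged sketch (not part of this commit), the same URL can be queried programmatically, for instance with requests; the endpoint and parameters below are exactly those quoted in the docstring, while the response handling is an assumption:

    # Sketch: list the columns exposed by I/355/gaiadr3 on the CDS xmatch service.
    import requests

    resp = requests.get(
        "http://cdsxmatch.u-strasbg.fr/xmatch/api/v1/sync/tables",
        params={
            "action": "getColList",
            "tabName": "I/355/gaiadr3",
            "RESPONSEFORMAT": "json",
        },
        timeout=30,
    )
    resp.raise_for_status()
    print(resp.json())  # structure of the JSON reply not shown here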
@@ -220,9 +223,11 @@ def xmatch_cds(
         Default is ["diaSource.diaSourceId", "diaSource.ra", "diaSource.dec"]
     cols_out: list of str
         N column names to get from the external catalog.
+        If None, assume ["otype"] for simbad.
     types: list of str
         N types of columns from the external catalog.
-        Should be SQL syntax (str=string, etc.)
+        Should be SQL syntax (str=string, etc.).
+        If None, return ["str"] for simbad.

     Returns
     -------
@@ -235,7 +240,7 @@ def xmatch_cds(

     # Simbad
     >>> df_simbad = xmatch_cds(df)
-    >>> 'cdsxmatch' in df_simbad.columns
+    >>> 'simbad_otype' in df_simbad.columns
     True

     # Gaia
@@ -245,7 +250,7 @@ def xmatch_cds(
     ...     catalogname='vizier:I/355/gaiadr3',
     ...     cols_out=['DR3Name', 'Plx', 'e_Plx'],
     ...     types=['string', 'float', 'float'])
-    >>> 'Plx' in df_gaia.columns
+    >>> 'vizier:I/355/gaiadr3_Plx' in df_gaia.columns
     True

     # VSX
@@ -255,7 +260,7 @@ def xmatch_cds(
     ...     distmaxarcsec=1.5,
     ...     cols_out=['Type'],
     ...     types=['string'])
-    >>> 'Type' in df_vsx.columns
+    >>> 'vizier:B/vsx/vsx_Type' in df_vsx.columns
     True

     # SPICY
@@ -265,13 +270,13 @@ def xmatch_cds(
     ...     distmaxarcsec=1.2,
     ...     cols_out=['SPICY', 'class'],
     ...     types=['int', 'string'])
-    >>> 'SPICY' in df_spicy.columns
+    >>> 'vizier:J/ApJS/254/33/table1_SPICY' in df_spicy.columns
     True
     """
     if cols_in is None:
         cols_in = ["diaSource.diaSourceId", "diaSource.ra", "diaSource.dec"]
     if cols_out is None:
-        cols_out = ["main_type"]
+        cols_out = ["otype"]
     if types is None:
         types = ["string"]

@@ -285,20 +290,15 @@ def xmatch_cds(
             F.lit(catalogname),
             F.lit(",".join(cols_out)),
         ),
-    ).withColumn("xmatch_split", F.split("xmatch", ","))
+    )

     for index, col_, type_ in zip(range(len(cols_out)), cols_out, types):
         df_out = df_out.withColumn(
-            col_, F.col("xmatch_split").getItem(index).astype(type_)
+            "{}_{}".format(catalogname, col_),
+            F.split("xmatch", ",").getItem(index).astype(type_),
         )

-    df_out = df_out.drop("xmatch", "xmatch_split")
-
-    # Keep compatibility with previous definitions
-    if "main_type" in df_out.columns:
-        # remove previous declaration if any
-        df_out = df_out.drop("cdsxmatch")
-        df_out = df_out.withColumnRenamed("main_type", "cdsxmatch")
+    df_out = df_out.drop("xmatch")

     return df_out

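The loop above now names each output column "<catalogname>_<col>" instead of keeping the bare catalog column name, which is what the updated doctests check (e.g. 'vizier:I/355/gaiadr3_Plx'). A small sketch of the naming rule alone, reusing the Gaia values from the doctest:

    # Sketch of the naming rule used in the withColumn loop above.
    catalogname = "vizier:I/355/gaiadr3"
    cols_out = ["DR3Name", "Plx", "e_Plx"]

    new_columns = ["{}_{}".format(catalogname, col_) for col_ in cols_out]
    print(new_columns)
    # ['vizier:I/355/gaiadr3_DR3Name', 'vizier:I/355/gaiadr3_Plx', 'vizier:I/355/gaiadr3_e_Plx']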
@@ -315,7 +315,7 @@ def xmatch_tns(df, distmaxarcsec=1.5, tns_raw_output=""):
     tns_raw_output: str, optional
         Folder that contains raw TNS catalog. Inside, it is expected
         to find the file `tns_raw.parquet` downloaded using
-        `fink-broker/bin/download_tns.py`. Default is "", in
+        `fink-broker/bin/download_tns.py`. Default is None, in
         which case the catalog will be downloaded. Beware that
         to download the catalog, you need to set environment variables:
         - TNS_API_MARKER: path to the TNS API marker (tns_marker.txt)
@@ -333,11 +333,11 @@ def xmatch_tns(df, distmaxarcsec=1.5, tns_raw_output=""):
     >>> curdir = os.path.dirname(os.path.abspath(__file__))
     >>> path = curdir + '/data/catalogs'
     >>> df_tns = xmatch_tns(df, tns_raw_output=path)
-    >>> 'tns' in df_tns.columns
+    >>> 'tns_type' in df_tns.columns
     True

-    >>> df_tns.filter(df_tns["tns"] != "").count()
-    0
+    >>> df_tns.filter(df_tns["tns_type"].isNull()).count()
+    50

     """
     if tns_raw_output == "":
@@ -351,8 +351,10 @@ def xmatch_tns(df, distmaxarcsec=1.5, tns_raw_output=""):
             _LOG.warning(
                 "TNS_API_MARKER and TNS_API_KEY are not defined as env var in the master."
             )
-            _LOG.warning("Skipping crossmatch with TNS.")
-            df = df.withColumn("tns", F.lit(""))
+            _LOG.warning(
+                "Skipping crossmatch with TNS. Creating a tns_type columns with null values."
+            )
+            df = df.withColumn("tns_type", F.lit(None))
             return df
     else:
         pdf_tns = pd.read_parquet(os.path.join(tns_raw_output, "tns_raw.parquet"))
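With the change above, skipping the TNS crossmatch still yields a tns_type column, but filled with nulls, so downstream code can rely on null tests rather than empty-string comparisons. A minimal, self-contained PySpark sketch on a toy DataFrame (not Fink data):

    # Sketch: null-based selection on a tns_type-like column, using toy rows.
    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    df = spark.createDataFrame(
        [(1, "SN Ia"), (2, None)],
        ["diaSourceId", "tns_type"],
    )

    df.filter(F.col("tns_type").isNotNull()).show()        # alerts with a TNS counterpart
    print(df.filter(F.col("tns_type").isNull()).count())   # unmatched alerts -> 1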
@@ -380,7 +382,7 @@ def crossmatch_with_tns(diaSourceId, ra, dec):
         Returns
         -------
         to_return: pd.Series of str
-            TNS type for the alert. `Unknown` if no match.
+            TNS type for the alert. null if no match.
         """
         pdf = pdf_tns_filt_b.value
         ra2, dec2, type2 = pdf["ra"], pdf["declination"], pdf["type"]
@@ -410,21 +412,21 @@ def crossmatch_with_tns(diaSourceId, ra, dec):
         # set separation length
         sep_constraint2 = d2d2.degree < distmaxarcsec / 3600.0

-        sub_pdf["TNS"] = [""] * len(sub_pdf)
+        sub_pdf["TNS"] = [None] * len(sub_pdf)
         sub_pdf["TNS"][sep_constraint2] = type2.to_numpy()[idx2[sep_constraint2]]

         # Here we take the first match
         # What if there are many? AT & SN?
         to_return = diaSourceId.apply(
-            lambda x: ""
+            lambda x: None
             if x not in sub_pdf["diaSourceId"].to_numpy()
             else sub_pdf["TNS"][sub_pdf["diaSourceId"] == x].to_numpy()[0]
         )

         return to_return

     df = df.withColumn(
-        "tns",
+        "tns_type",
         crossmatch_with_tns(
             df["diaSource.diaSourceId"], df["diaSource.ra"], df["diaSource.dec"]
         ),
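The apply above now maps alerts without a TNS counterpart to None (instead of an empty string). A standalone sketch of the same pattern on toy pandas data:

    # Toy data: diaSourceId 3 has no counterpart in sub_pdf and maps to None.
    import pandas as pd

    sub_pdf = pd.DataFrame({"diaSourceId": [1, 2], "TNS": ["SN Ia", None]})
    diaSourceId = pd.Series([1, 2, 3])

    to_return = diaSourceId.apply(
        lambda x: None
        if x not in sub_pdf["diaSourceId"].to_numpy()
        else sub_pdf["TNS"][sub_pdf["diaSourceId"] == x].to_numpy()[0]
    )
    print(to_return.tolist())  # ['SN Ia', None, None]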
@@ -460,7 +462,7 @@ def crossmatch_other_catalog(diaSourceId, ra, dec, catalog_name, radius_arcsec=N
     Returns
     -------
     type: str
-        Object type from the catalog. `Unknown` if no match.
+        Object type from the catalog. null if no match.

     Examples
     --------
@@ -490,28 +492,28 @@ def crossmatch_other_catalog(diaSourceId, ra, dec, catalog_name, radius_arcsec=N
     ...     'gcvs',
     ...     crossmatch_other_catalog(df['id'], df['ra'], df['dec'], lit('gcvs'))
     ... ).show() # doctest: +NORMALIZE_WHITESPACE
-    +---+-----------+-----------+-------+
-    | id|         ra|        dec|   gcvs|
-    +---+-----------+-----------+-------+
-    |  1| 26.8566983|-26.9677112|Unknown|
-    |  2|101.3520545| 24.5421872|     RR|
-    |  3|     0.3126|    47.6859|Unknown|
-    |  4| 0.31820833|29.59277778|Unknown|
-    +---+-----------+-----------+-------+
+    +---+-----------+-----------+----+
+    | id|         ra|        dec|gcvs|
+    +---+-----------+-----------+----+
+    |  1| 26.8566983|-26.9677112|null|
+    |  2|101.3520545| 24.5421872|  RR|
+    |  3|     0.3126|    47.6859|null|
+    |  4| 0.31820833|29.59277778|null|
+    +---+-----------+-----------+----+
     <BLANKLINE>

     >>> df.withColumn(
     ...     'vsx',
     ...     crossmatch_other_catalog(df['id'], df['ra'], df['dec'], lit('vsx'))
     ... ).show() # doctest: +NORMALIZE_WHITESPACE
-    +---+-----------+-----------+-------+
-    | id|         ra|        dec|    vsx|
-    +---+-----------+-----------+-------+
-    |  1| 26.8566983|-26.9677112|   MISC|
-    |  2|101.3520545| 24.5421872|   RRAB|
-    |  3|     0.3126|    47.6859|Unknown|
-    |  4| 0.31820833|29.59277778|Unknown|
-    +---+-----------+-----------+-------+
+    +---+-----------+-----------+----+
+    | id|         ra|        dec| vsx|
+    +---+-----------+-----------+----+
+    |  1| 26.8566983|-26.9677112|MISC|
+    |  2|101.3520545| 24.5421872|RRAB|
+    |  3|     0.3126|    47.6859|null|
+    |  4| 0.31820833|29.59277778|null|
+    +---+-----------+-----------+----+
     <BLANKLINE>

     >>> df.withColumn(
@@ -521,9 +523,9 @@ def crossmatch_other_catalog(diaSourceId, ra, dec, catalog_name, radius_arcsec=N
     +---+-----------+-----------+--------------------+
     | id|         ra|        dec|                3hsp|
     +---+-----------+-----------+--------------------+
-    |  1| 26.8566983|-26.9677112|             Unknown|
-    |  2|101.3520545| 24.5421872|             Unknown|
-    |  3|     0.3126|    47.6859|             Unknown|
+    |  1| 26.8566983|-26.9677112|                null|
+    |  2|101.3520545| 24.5421872|                null|
+    |  3|     0.3126|    47.6859|                null|
     |  4| 0.31820833|29.59277778|3HSPJ000116.4+293534|
     +---+-----------+-----------+--------------------+
     <BLANKLINE>
@@ -535,10 +537,10 @@ def crossmatch_other_catalog(diaSourceId, ra, dec, catalog_name, radius_arcsec=N
     +---+-----------+-----------+-----------------+
     | id|         ra|        dec|             4lac|
     +---+-----------+-----------+-----------------+
-    |  1| 26.8566983|-26.9677112|          Unknown|
-    |  2|101.3520545| 24.5421872|          Unknown|
+    |  1| 26.8566983|-26.9677112|             null|
+    |  2|101.3520545| 24.5421872|             null|
     |  3|     0.3126|    47.6859|4FGL J0001.2+4741|
-    |  4| 0.31820833|29.59277778|          Unknown|
+    |  4| 0.31820833|29.59277778|             null|
     +---+-----------+-----------+-----------------+
     <BLANKLINE>
     """
@@ -578,7 +580,7 @@ def crossmatch_other_catalog(diaSourceId, ra, dec, catalog_name, radius_arcsec=N
         pdf, catalog_rubin, catalog_other, radius_arcsec=radius_arcsec
     )

-    pdf_merge["Type"] = "Unknown"
+    pdf_merge["Type"] = None
     pdf_merge.loc[mask, "Type"] = [
         str(i).strip() for i in type2.astype(str).to_numpy()[idx2]
     ]
@@ -605,7 +607,7 @@ def crossmatch_mangrove(diaSourceId, ra, dec, radius_arcsec=None):
     Returns
     -------
     type: str
-        Object type from the catalog. `Unknown` if no match.
+        Object type from the catalog. null if no match.

     Examples
     --------
@@ -637,9 +639,9 @@ def crossmatch_mangrove(diaSourceId, ra, dec, radius_arcsec=None):
     ... ).toPandas() # doctest: +NORMALIZE_WHITESPACE
        id ra dec mangrove
     0 1 198.955536 42.029289 {'HyperLEDA_name': 'NGC5055', '2MASS_name': '1...
-    1 2 101.352054 24.542187 {'HyperLEDA_name': 'None', '2MASS_name': 'None...
-    2 3 0.312600 47.685900 {'HyperLEDA_name': 'None', '2MASS_name': 'None...
-    3 4 0.318208 29.592778 {'HyperLEDA_name': 'None', '2MASS_name': 'None...
+    1 2 101.352054 24.542187 {'HyperLEDA_name': None, '2MASS_name': None, '...
+    2 3 0.312600 47.685900 {'HyperLEDA_name': None, '2MASS_name': None, '...
+    3 4 0.318208 29.592778 {'HyperLEDA_name': None, '2MASS_name': None, '...
     """
     pdf = pd.DataFrame({
         "ra": ra.to_numpy(),
@@ -666,7 +668,7 @@ def crossmatch_mangrove(diaSourceId, ra, dec, radius_arcsec=None):
         pdf, catalog_rubin, catalog_other, radius_arcsec=radius_arcsec
     )

-    default = {name: "None" for name in MANGROVE_COLS}
+    default = {name: None for name in MANGROVE_COLS}
     pdf_merge["Type"] = [default for i in range(len(pdf_merge))]
     pdf_merge.loc[mask, "Type"] = [payload[i] for i in idx2]

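Similarly for Mangrove, unmatched alerts now carry a dictionary of real None values rather than the string 'None' (see the updated doctest above), so host-galaxy checks reduce to identity tests. A tiny sketch with toy payloads; only the HyperLEDA_name key, whose value appears in the doctest, is shown:

    # Toy payloads shaped like the mangrove column in the doctest above.
    matched = {"HyperLEDA_name": "NGC5055"}
    unmatched = {"HyperLEDA_name": None}

    def has_host(payload):
        # A real association is a non-None name, not the string "None".
        return payload["HyperLEDA_name"] is not None

    print(has_host(matched), has_host(unmatched))  # True False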