Skip to content

Commit 337a231

Browse files
authored
feat: rsync rfc content, store in blob, rebuild references (#10255)
* feat: rsync rfc content, store in blob, rebuild references * fix: isolate subprocess. Guard against missing file * fix: correct variable initialization. guard against unnecessary call * test: mock rsync task calls * fix: use list for typing rather than List * fix: string formatting * fix: generalize error string when there are no files to parse * fix: use delete_on_close with NamedTemporaryFile * fix: mtime is less distracting than m_time * fix: store the notprepped file on the fs * fix: typo * fix: fetch json, remove unneeded unlink * chore: ruff * fix: use list for typing * fix: typo * feat: bulk load rfcs into blob storage * fix: restrict the rsync_helper to rsync * test: test ietf.sync.utils * chore: honor typing choices * test: sync task tests * refactor: isolate the rsync from-file construction and test it * chore: ruff * fix: reflect current changes in older test * fix: address incorrect test assumption * chore: adhere to task naming conventions
1 parent 0e9e18e commit 337a231

File tree

8 files changed

+338
-19
lines changed

8 files changed

+338
-19
lines changed

ietf/doc/tasks.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from .utils import (
3030
generate_idnits2_rfc_status,
3131
generate_idnits2_rfcs_obsoleted,
32+
rebuild_reference_relations,
3233
update_or_create_draft_bibxml_file,
3334
ensure_draft_bibxml_path_exists,
3435
investigate_fragment,
@@ -128,3 +129,23 @@ def investigate_fragment_task(name_fragment: str):
128129
"name_fragment": name_fragment,
129130
"results": investigate_fragment(name_fragment),
130131
}
132+
133+
@shared_task
def rebuild_reference_relations_task(doc_names: list[str]):
    """Rebuild reference relations for the named RFCs and drafts.

    Only documents of type "rfc" or "draft" whose name is in ``doc_names``
    are considered. For each one, the XML and plaintext files are looked up
    on disk (RFCs under ``settings.RFC_PATH`` by name, drafts under
    ``settings.INTERNET_ALL_DRAFTS_ARCHIVE_DIR`` by name and revision), and
    whichever exist are handed to ``rebuild_reference_relations``. A
    document with neither file is logged and skipped.
    """
    log.log(f"Task: Rebuilding reference relations for {doc_names}")
    docs = Document.objects.filter(name__in=doc_names, type__in=["rfc", "draft"])
    for doc in docs:
        # RFC files are named by document name alone; draft files carry the revision.
        if doc.type_id == "rfc":
            content_dir = settings.RFC_PATH
            stem = doc.name
        else:
            content_dir = settings.INTERNET_ALL_DRAFTS_ARCHIVE_DIR
            stem = f"{doc.name}-{doc.rev}"

        candidates = {
            ext: Path(content_dir) / f"{stem}.{ext}" for ext in ("xml", "txt")
        }
        filenames = {
            ext: str(candidate)
            for ext, candidate in candidates.items()
            if candidate.is_file()
        }

        if not filenames:
            log.log(f"Found no content for {stem}")
            continue
        rebuild_reference_relations(doc, filenames)

ietf/doc/tests_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -389,13 +389,13 @@ def test_requires_txt_or_xml(self):
389389
result = rebuild_reference_relations(self.doc, {})
390390
self.assertCountEqual(result.keys(), ['errors'])
391391
self.assertEqual(len(result['errors']), 1)
392-
self.assertIn('No Internet-Draft text available', result['errors'][0],
392+
self.assertIn('No file available', result['errors'][0],
393393
'Error should be reported if no Internet-Draft file is given')
394394

395395
result = rebuild_reference_relations(self.doc, {'md': 'cant-do-this.md'})
396396
self.assertCountEqual(result.keys(), ['errors'])
397397
self.assertEqual(len(result['errors']), 1)
398-
self.assertIn('No Internet-Draft text available', result['errors'][0],
398+
self.assertIn('No file available', result['errors'][0],
399399
'Error should be reported if no XML or plaintext file is given')
400400

401401
@patch.object(XMLDraft, 'get_refs')

ietf/doc/utils.py

Lines changed: 32 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -941,50 +941,66 @@ def rebuild_reference_relations(doc, filenames):
941941
942942
filenames should be a dict mapping file ext (i.e., type) to the full path of each file.
943943
"""
944-
if doc.type.slug != 'draft':
944+
if doc.type.slug not in ["draft", "rfc"]:
945945
return None
946+
947+
log.log(f"Rebuilding reference relations for {doc.name}")
946948

947949
# try XML first
948-
if 'xml' in filenames:
949-
refs = XMLDraft(filenames['xml']).get_refs()
950-
elif 'txt' in filenames:
951-
filename = filenames['txt']
950+
if "xml" in filenames:
951+
refs = XMLDraft(filenames["xml"]).get_refs()
952+
elif "txt" in filenames:
953+
filename = filenames["txt"]
952954
try:
953955
refs = draft.PlaintextDraft.from_file(filename).get_refs()
954956
except IOError as e:
955-
return { 'errors': ["%s :%s" % (e.strerror, filename)] }
957+
return {"errors": [f"{e.strerror}: {filename}"]}
956958
else:
957-
return {'errors': ['No Internet-Draft text available for rebuilding reference relations. Need XML or plaintext.']}
959+
return {
960+
"errors": [
961+
"No file available for rebuilding reference relations. Need XML or plaintext."
962+
]
963+
}
958964

959-
doc.relateddocument_set.filter(relationship__slug__in=['refnorm','refinfo','refold','refunk']).delete()
965+
doc.relateddocument_set.filter(
966+
relationship__slug__in=["refnorm", "refinfo", "refold", "refunk"]
967+
).delete()
960968

961969
warnings = []
962970
errors = []
963971
unfound = set()
964-
for ( ref, refType ) in refs.items():
972+
for ref, refType in refs.items():
965973
refdoc = Document.objects.filter(name=ref)
966974
if not refdoc and re.match(r"^draft-.*-\d{2}$", ref):
967975
refdoc = Document.objects.filter(name=ref[:-3])
968976
count = refdoc.count()
969977
if count == 0:
970-
unfound.add( "%s" % ref )
978+
unfound.add("%s" % ref)
971979
continue
972980
elif count > 1:
973-
errors.append("Too many Document objects found for %s"%ref)
981+
errors.append("Too many Document objects found for %s" % ref)
974982
else:
975983
# Don't add references to ourself
976984
if doc != refdoc[0]:
977-
RelatedDocument.objects.get_or_create( source=doc, target=refdoc[ 0 ], relationship=DocRelationshipName.objects.get( slug='ref%s' % refType ) )
985+
RelatedDocument.objects.get_or_create(
986+
source=doc,
987+
target=refdoc[0],
988+
relationship=DocRelationshipName.objects.get(
989+
slug="ref%s" % refType
990+
),
991+
)
978992
if unfound:
979-
warnings.append('There were %d references with no matching Document'%len(unfound))
993+
warnings.append(
994+
"There were %d references with no matching Document" % len(unfound)
995+
)
980996

981997
ret = {}
982998
if errors:
983-
ret['errors']=errors
999+
ret["errors"] = errors
9841000
if warnings:
985-
ret['warnings']=warnings
1001+
ret["warnings"] = warnings
9861002
if unfound:
987-
ret['unfound']=list(unfound)
1003+
ret["unfound"] = list(unfound)
9881004

9891005
return ret
9901006

ietf/settings.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -811,6 +811,7 @@ def skip_unreadable_post(record):
811811
"polls",
812812
"procmaterials",
813813
"review",
814+
"rfc",
814815
"slides",
815816
"staging",
816817
"statchg",

ietf/sync/tasks.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
#
55
import datetime
66
import io
7+
from pathlib import Path
8+
from tempfile import NamedTemporaryFile
79
import requests
810

911
from celery import shared_task
@@ -12,9 +14,11 @@
1214
from django.utils import timezone
1315

1416
from ietf.doc.models import DocEvent, RelatedDocument
17+
from ietf.doc.tasks import rebuild_reference_relations_task
1518
from ietf.sync import iana
1619
from ietf.sync import rfceditor
1720
from ietf.sync.rfceditor import MIN_QUEUE_RESULTS, parse_queue, update_drafts_from_queue
21+
from ietf.sync.utils import build_from_file_content, load_rfcs_into_blobdb, rsync_helper
1822
from ietf.utils import log
1923
from ietf.utils.timezone import date_today
2024

@@ -65,11 +69,16 @@ def rfc_editor_index_update_task(full_index=False):
6569
if len(errata_data) < rfceditor.MIN_ERRATA_RESULTS:
6670
log.log("Not enough errata entries, only %s" % len(errata_data))
6771
return # failed
72+
newly_published = set()
6873
for rfc_number, changes, doc, rfc_published in rfceditor.update_docs_from_rfc_index(
6974
index_data, errata_data, skip_older_than_date=skip_date
7075
):
7176
for c in changes:
7277
log.log("RFC%s, %s: %s" % (rfc_number, doc.name, c))
78+
if rfc_published:
79+
newly_published.add(rfc_number)
80+
if len(newly_published) > 0:
81+
rsync_rfcs_from_rfceditor_task.delay(list(newly_published))
7382

7483

7584
@shared_task
@@ -222,3 +231,44 @@ def fix_subseries_docevents_task():
222231
DocEvent.objects.filter(type="sync_from_rfc_editor", desc=desc).update(
223232
time=obsoleting_time
224233
)
234+
235+
@shared_task
def rsync_rfcs_from_rfceditor_task(rfc_numbers: list[int]):
    """Pull the given RFCs from the RFC Editor and process them locally.

    Steps, in order:
      1. Write the text produced by ``build_from_file_content(rfc_numbers)``
         to a temporary file and pass it to ``rsync_helper`` as an
         ``--include-from`` list (everything else is excluded, and existing
         files in ``settings.RFC_PATH`` are left untouched).
      2. Call ``load_rfcs_into_blobdb`` for the same RFC numbers.
      3. Queue ``rebuild_reference_relations_task`` for ``rfc<number>`` names.
    """
    log.log(f"Rsyncing rfcs from rfc-editor: {rfc_numbers}")
    with NamedTemporaryFile(mode="w", delete_on_close=False) as include_file:
        include_file.write(build_from_file_content(rfc_numbers))
        # Close now so the content is flushed before the helper reads the
        # file; delete_on_close=False keeps it on disk until the with-block
        # exits.
        include_file.close()
        rsync_args = [
            "-a",
            "--ignore-existing",
            f"--include-from={Path(include_file.name)}",
            "--exclude=*",
            "rsync.rfc-editor.org::rfcs/",
            str(settings.RFC_PATH),
        ]
        rsync_helper(rsync_args)

    load_rfcs_into_blobdb(rfc_numbers)

    doc_names = [f"rfc{num}" for num in rfc_numbers]
    rebuild_reference_relations_task.delay(doc_names)
256+
257+
258+
@shared_task
def load_rfcs_into_blobdb_task(start: int, end: int):
    """Load file content for rfc{start} through rfc{end}, inclusive, into the blobdb

    This task is expected to be removed once the blobdb is populated, so
    it clamps the requested range to a hard-coded maximum end.
    It will not overwrite any existing blob content, and will only log a
    small complaint if asked to load a non-existing RFC.
    """
    # Protect us from ourselves
    if end < start:
        return
    highest_allowed = 11000  # Arbitrarily chosen
    first = max(start, 1)
    last = min(end, highest_allowed)
    load_rfcs_into_blobdb(list(range(first, last + 1)))

ietf/sync/tests.py

Lines changed: 81 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -889,8 +889,9 @@ class TaskTests(TestCase):
889889
@mock.patch("ietf.sync.tasks.rfceditor.update_docs_from_rfc_index")
890890
@mock.patch("ietf.sync.tasks.rfceditor.parse_index")
891891
@mock.patch("ietf.sync.tasks.requests.get")
892+
@mock.patch("ietf.sync.tasks.rsync_rfcs_from_rfceditor_task.delay")
892893
def test_rfc_editor_index_update_task(
893-
self, requests_get_mock, parse_index_mock, update_docs_mock
894+
self, rsync_task_mock, requests_get_mock, parse_index_mock, update_docs_mock
894895
) -> None: # the annotation here prevents mypy from complaining about annotation-unchecked
895896
"""rfc_editor_index_update_task calls helpers correctly
896897
@@ -922,6 +923,7 @@ def json(self):
922923
rfc = RfcFactory()
923924

924925
# Test with full_index = False
926+
rsync_task_mock.return_value = None
925927
requests_get_mock.side_effect = (index_response, errata_response) # will step through these
926928
parse_index_mock.return_value = MockIndexData(length=rfceditor.MIN_INDEX_RESULTS)
927929
update_docs_mock.return_value = (
@@ -947,10 +949,13 @@ def json(self):
947949
)
948950
self.assertIsNotNone(update_docs_kwargs["skip_older_than_date"])
949951

952+
self.assertFalse(rsync_task_mock.called)
953+
950954
# Test again with full_index = True
951955
requests_get_mock.reset_mock()
952956
parse_index_mock.reset_mock()
953957
update_docs_mock.reset_mock()
958+
rsync_task_mock.reset_mock()
954959
requests_get_mock.side_effect = (index_response, errata_response) # will step through these
955960
tasks.rfc_editor_index_update_task(full_index=True)
956961

@@ -971,40 +976,67 @@ def json(self):
971976
)
972977
self.assertIsNone(update_docs_kwargs["skip_older_than_date"])
973978

979+
self.assertFalse(rsync_task_mock.called)
980+
981+
# Test again where the index would cause a new RFC to come into existence
982+
requests_get_mock.reset_mock()
983+
parse_index_mock.reset_mock()
984+
update_docs_mock.reset_mock()
985+
rsync_task_mock.reset_mock()
986+
requests_get_mock.side_effect = (
987+
index_response,
988+
errata_response,
989+
) # will step through these
990+
update_docs_mock.return_value = (
991+
(rfc.rfc_number, ("something changed",), rfc, True),
992+
)
993+
tasks.rfc_editor_index_update_task(full_index=True)
994+
self.assertTrue(rsync_task_mock.called)
995+
rsync_task_args, rsync_task_kwargs = rsync_task_mock.call_args
996+
self.assertEqual((([rfc.rfc_number],), {}), (rsync_task_args, rsync_task_kwargs))
997+
974998
# Test error handling
975999
requests_get_mock.reset_mock()
9761000
parse_index_mock.reset_mock()
9771001
update_docs_mock.reset_mock()
1002+
rsync_task_mock.reset_mock()
9781003
requests_get_mock.side_effect = requests.Timeout # timeout on every get()
9791004
tasks.rfc_editor_index_update_task(full_index=False)
9801005
self.assertFalse(parse_index_mock.called)
9811006
self.assertFalse(update_docs_mock.called)
1007+
self.assertFalse(rsync_task_mock.called)
9821008

9831009
requests_get_mock.reset_mock()
9841010
parse_index_mock.reset_mock()
9851011
update_docs_mock.reset_mock()
1012+
rsync_task_mock.reset_mock()
9861013
requests_get_mock.side_effect = [index_response, requests.Timeout] # timeout second get()
9871014
tasks.rfc_editor_index_update_task(full_index=False)
9881015
self.assertFalse(update_docs_mock.called)
1016+
self.assertFalse(rsync_task_mock.called)
9891017

9901018
requests_get_mock.reset_mock()
9911019
parse_index_mock.reset_mock()
9921020
update_docs_mock.reset_mock()
1021+
rsync_task_mock.reset_mock()
9931022
requests_get_mock.side_effect = [index_response, errata_response]
9941023
# feed in an index that is too short
9951024
parse_index_mock.return_value = MockIndexData(length=rfceditor.MIN_INDEX_RESULTS - 1)
9961025
tasks.rfc_editor_index_update_task(full_index=False)
9971026
self.assertTrue(parse_index_mock.called)
9981027
self.assertFalse(update_docs_mock.called)
1028+
self.assertFalse(rsync_task_mock.called)
9991029

10001030
requests_get_mock.reset_mock()
10011031
parse_index_mock.reset_mock()
10021032
update_docs_mock.reset_mock()
1033+
rsync_task_mock.reset_mock()
10031034
requests_get_mock.side_effect = [index_response, errata_response]
10041035
errata_response.json_length = rfceditor.MIN_ERRATA_RESULTS - 1 # too short
10051036
parse_index_mock.return_value = MockIndexData(length=rfceditor.MIN_INDEX_RESULTS)
10061037
tasks.rfc_editor_index_update_task(full_index=False)
10071038
self.assertFalse(update_docs_mock.called)
1039+
self.assertFalse(rsync_task_mock.called)
10081040

10091041
@override_settings(RFC_EDITOR_QUEUE_URL="https://rfc-editor.example.com/queue/")
10101042
@mock.patch("ietf.sync.tasks.update_drafts_from_queue")
@@ -1134,3 +1166,51 @@ def test_iana_protocols_update_task(
11341166
self.assertTrue(requests_get_mock.called)
11351167
self.assertFalse(parse_protocols_mock.called)
11361168
self.assertFalse(update_rfc_log_mock.called)
1169+
1170+
@mock.patch("ietf.sync.tasks.rsync_helper")
@mock.patch("ietf.sync.tasks.load_rfcs_into_blobdb")
@mock.patch("ietf.sync.tasks.rebuild_reference_relations_task.delay")
def test_rsync_rfcs_from_rfceditor_task(
    self,
    rebuild_relations_mock,
    load_blobs_mock,
    rsync_helper_mock,
):
    """rsync_rfcs_from_rfceditor_task calls its (mocked) helpers correctly"""
    tasks.rsync_rfcs_from_rfceditor_task([12345, 54321])

    # The rsync helper is invoked; its arguments are not inspected here.
    self.assertTrue(rsync_helper_mock.called)

    # The blob loader receives the RFC numbers, positionally, with no kwargs.
    self.assertTrue(load_blobs_mock.called)
    self.assertEqual(load_blobs_mock.call_args, mock.call([12345, 54321]))

    # The reference rebuild is queued with the matching document names.
    self.assertTrue(rebuild_relations_mock.called)
    self.assertEqual(
        rebuild_relations_mock.call_args,
        mock.call(["rfc12345", "rfc54321"]),
    )
1189+
1190+
@mock.patch("ietf.sync.tasks.load_rfcs_into_blobdb")
def test_load_rfcs_into_blobdb_task(
    self,
    load_blobs_mock,
):
    """load_rfcs_into_blobdb_task validates and clamps its range"""
    # An inverted range must not reach the loader at all.
    tasks.load_rfcs_into_blobdb_task(5, 3)
    self.assertFalse(load_blobs_mock.called)

    # ((start, end), RFC numbers expected to be handed to the loader)
    cases = (
        ((-1, 1), [1]),  # start is raised to 1
        ((10999, 50000), [10999, 11000]),  # end is capped at 11000
        ((3261, 3263), [3261, 3262, 3263]),  # in-range request passes through
    )
    for (start, end), expected_numbers in cases:
        load_blobs_mock.reset_mock()
        tasks.load_rfcs_into_blobdb_task(start, end)
        self.assertTrue(load_blobs_mock.called)
        self.assertEqual(load_blobs_mock.call_args, mock.call(expected_numbers))
1215+
1216+

0 commit comments

Comments
 (0)