Initial commit

Nick Touran 2025-09-22 14:54:12 -04:00
commit 56a79e3562
55 changed files with 1905 additions and 0 deletions

0
src/nrsk/__init__.py Normal file

168
src/nrsk/regs/load_10cfr50.py Executable file

@@ -0,0 +1,168 @@
"""
Read text 10 CFR 50 App B and turn to json and xlsx.
Text should be loaded from web and pasted into data folder.
Needs to eventually write requirements data to flow down to QA program.
One way to do that is to write a ``needs.json`` file to the format
described `here <https://sphinx-needs.readthedocs.io/en/latest/builders.html#format>`_.
"""
import json
import logging
import re
import requests
from bs4 import BeautifulSoup
from nrsk.utils import excel, get_project_root, needs, tokenize
logging.basicConfig(level=logging.INFO)
ROOT = get_project_root()
LOG = logging.getLogger(__name__)
TXT_10_CFR_50_APPA = ROOT / "data" / "regs" / "10-cfr-50-app-a.txt"
TXT_10_CFR_50_APPB = ROOT / "data" / "regs" / "10-cfr-50-app-b.txt"
OUT_PATH = ROOT / "documents" / "generated_assets"
# todo: move to config file in case they change?
URL_APP_A = (
"https://www.nrc.gov/reading-rm/doc-collections/cfr/part050/part050-appa.html"
)
URL_APP_B = (
"https://www.nrc.gov/reading-rm/doc-collections/cfr/part050/part050-appb.html"
)


def fetch_10cfr50_app_a():
    """Go online and fetch the text of 10 CFR 50 App A.

    We still commit the results to the repo because this system
    may be run in an environment that does not have internet access.
    This function shows how the committed files were created in the first place.

    Note that this may require changes if the NRC updates the format of the
    webpage.
    """

def _is_criteria_label(tag):
return tag.name == "p" and tag.string == "Criteria"
    # timeout keeps the script from hanging if nrc.gov is unreachable
    html_app_a = requests.get(URL_APP_A, timeout=30).text
soup = BeautifulSoup(html_app_a, "html.parser")
# criteria are all in the last general-content div
content = soup.find_all("div", class_="general-content")[-1]
label = content.find_next(_is_criteria_label)
with TXT_10_CFR_50_APPA.open("w") as f:
for crit in label.find_all_next("p"):
if "1 Further details relating" in crit.text:
# marks the end
break
f.write(crit.text + "\n\n")


def process_10cfr50_app_a():
    """
    Convert App A text (General Design Criteria) into Sphinx Needs.

    Each criterion has a title followed by a number of "shall" sentences,
    so each numbered criterion should produce one main need plus several
    child needs. The input text file should start in the Criteria section,
    e.g. at Overall Requirements.

    Since this makes parent/child requirements, we'll try the sphinx-needs
    list2need feature for this one:
    https://sphinx-needs.readthedocs.io/en/latest/directives/list2need.html

    list2need mangles labels on (1) and (2) listings in the text, so we remap
    those to 1., 2., etc. It also chokes on non-numerical parentheticals
    (likely a sphinx-needs bug), which we map to comma phrases, and even then
    some output is still mangled.

    .. todo:: submit a bug report to sphinx-needs with these examples.
    """
    def process(line):
        # remap "(1)" -> "1." so list2need keeps the label intact
        line = re.sub(r"\((\d+)\)", r"\1.", line)
        # remap remaining parentheticals to comma phrases; [^)] keeps the
        # match from spanning multiple parentheticals on one line
        line = re.sub(r"\(([^)]+)\)", r", \1,", line)
        return line
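    # e.g. (hypothetical input):
    #   process("Criterion (1) applies (see note 2).")
    #   -> "Criterion 1. applies , see note 2,."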
needs_lines = [".. list2need::", " :types: req, req, req", ""]
with open(TXT_10_CFR_50_APPA) as inp_f:
for line in inp_f:
line = line.strip()
if not line:
continue
            if re.search(r"^[A-Z]+\. [A-Z]", line):
                # section headings, e.g. "I. Overall Requirements"
                LOG.info("Reading %s", line)
elif match := re.search(r"^Criterion (\d+)—(.+?)\.(.+)", line):
num = match.group(1).strip()
title = match.group(2).strip()
subnum = 1
                LOG.info("Found Criterion %s - %s", num, title)
needs_lines.append(f" * (R_GDC_{num}){process(title)}")
sentences = tokenize.tokenize_sentences(match.group(3).strip())
for sent in sentences:
needs_lines.append(
f" * (R_GDC_{num}_{subnum}){process(sent.strip())}"
)
subnum += 1
            else:
                # continuation of the current criterion; num and subnum carry over
                sentences = tokenize.tokenize_sentences(line)
for sent in sentences:
needs_lines.append(
f" * (R_GDC_{num}_{subnum}){process(sent.strip())}"
)
subnum += 1
with open(OUT_PATH / "10-cfr-50-app-a-list.rst", "w") as out_f:
out_f.write("\n".join(needs_lines))
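# The generated RST looks roughly like this (hypothetical excerpt; in the real
# file the child bullets are indented under the parent so list2need nests them):
#
#   .. list2need::
#      :types: req, req, req
#
#      * (R_GDC_1)Quality standards and records
#        * (R_GDC_1_1)Structures, systems, and components important to safety
#          shall be designed ... to quality standards commensurate with ...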


def process_10cfr50_app_b():
    """
    Read 10 CFR 50 App B and generate needs data and an xlsx workbook from it.

    Just tokenizing the text straight up works well enough here.
    """
with open(TXT_10_CFR_50_APPB) as inp_f:
sentences = tokenize.tokenize_sentences(inp_f.read())
needs_data = []
i = 0
    for s in sentences:
        if len(s) < 40:
            # length heuristic to skip headings and other fragments
            LOG.warning("Skipping short sentence: `%s`", s)
            continue
needs_data.append((f"10 CFR 50 App B Sentence {i}", s))
i += 1
with (OUT_PATH / "10-cfr-50-app-b.json").open("w") as f:
json.dump(
needs.data_to_needs(needs_data, prefix="R_APPB_", links=["R_10CFR50_APPB"]),
f,
indent=4,
)
filtered_titles, filtered_sentences = zip(*needs_data)
wb = excel.make_xlsx_from_sentences(filtered_sentences)
wb.save(OUT_PATH / "10-cfr-50-app-b.xlsx") # can only write to disk, not stream


if __name__ == "__main__":
    # fetch_10cfr50_app_a()  # uncomment to re-fetch the source text from nrc.gov
    OUT_PATH.mkdir(parents=True, exist_ok=True)
    process_10cfr50_app_a()
    process_10cfr50_app_b()

5
src/nrsk/utils/__init__.py Normal file

@@ -0,0 +1,5 @@
"""General Utils."""
from pathlib import Path
def get_project_root() -> Path:
return Path(__file__).parent.parent.parent.parent

12
src/nrsk/utils/excel.py Normal file

@@ -0,0 +1,12 @@
"""Excel utils."""
from openpyxl import Workbook
def make_xlsx_from_sentences(sentences: list) -> Workbook:
"""Convert json list of sentences to xlsx with numbers."""
wb = Workbook()
ws = wb.active
for row_data in enumerate(sentences):
ws.append(row_data)
return wb
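# Example (hypothetical): make_xlsx_from_sentences(["First.", "Second."]) yields
# rows (0, "First.") and (1, "Second.") in the active worksheet.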

53
src/nrsk/utils/forms.py Normal file

@@ -0,0 +1,53 @@
"""
Utilities for working with Forms.
"""
import os
import subprocess
import yaml
PDFTK = "/usr/bin/pdftk"


def load_form_data(file_path):
    """Load yaml or json form data intended to be populated into a form template."""
    with open(file_path, "r") as f:
        if file_path.endswith(".yaml"):
            return yaml.safe_load(f)
        elif file_path.endswith(".json"):
            return json.load(f)
    raise ValueError(f"Unsupported form data format: {file_path}")


def create_fdf(data, fdf_file):
    """Create form data in FDF format to fill into a PDF form template."""
    fdf = "%FDF-1.2\n1 0 obj\n<< /FDF << /Fields ["
    for key, value in data.items():
        value = str(value).replace("\n", " ").replace("\r", "")
        # backslash and parentheses are special in FDF literal strings
        value = value.replace("\\", r"\\").replace("(", r"\(").replace(")", r"\)")
        fdf += f"<< /V ({value}) /T ({key}) >>"
    fdf += "] >> >>\nendobj\ntrailer\n<< /Root 1 0 R >>\n%%EOF"
    with open(fdf_file, "w") as f:
        f.write(fdf)
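# For data={"Name": "DIF3D"}, the generated FDF looks like:
#   %FDF-1.2
#   1 0 obj
#   << /FDF << /Fields [<< /V (DIF3D) /T (Name) >>] >> >>
#   endobj
#   trailer
#   << /Root 1 0 R >>
#   %%EOF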


def fill_pdf(template_pdf, fdf_file, output_pdf):
    """Populate a given pdf form template with form data."""
    temp_fname = output_pdf + ".filled"
    subprocess.run(
        [PDFTK, template_pdf, "fill_form", fdf_file, "output", temp_fname],
        check=True,
    )
    # and flatten it so the filled fields are no longer editable
    subprocess.run([PDFTK, temp_fname, "output", output_pdf, "flatten"], check=True)
    os.remove(temp_fname)
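# Shell equivalent of the two pdftk calls above (hypothetical file names):
#   pdftk template.pdf fill_form data.fdf output tmp.pdf
#   pdftk tmp.pdf output filled.pdf flatten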


def demo():
    # Example usage.
    # For checkboxes, use "Yes" and "Off".
data = {"Name": "DIF3D", "ExportControlled": "Yes", "Type": "Engineering"}
create_fdf(data, "data.fdf")
fill_pdf(
"software-dedication-form.pdf",
"data.fdf",
"software-dedication-form-filled.pdf",
)

37
src/nrsk/utils/needs.py Normal file

@@ -0,0 +1,37 @@
"""Utils for building/managing sphinx-needs data."""
import datetime
def data_to_needs(
needs_data: list[str, str],
prefix="",
type="req",
type_name="Requirement",
links=None,
) -> dict:
"""
Convert list of (titles, texts) to valid needs data.
Format from: https://sphinx-needs.readthedocs.io/en/latest/builders.html#format
"""
now = datetime.datetime.now().isoformat()
need_data = {}
    for i, (title, body) in enumerate(needs_data):
        need_id = f"{prefix}{i}"
        need_data[need_id] = {
            "description": body,
            "id": need_id,
            "title": title,
            "type": type,
            "type_name": type_name,
            "links": links or [],
            "tags": [],
        }
data = {
"created": now,
"current_version": "1.0",
"project": "nuclear reactor starter kit",
"versions": {"1.0": {"created": now, "needs": need_data}},
}
return data
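# Example (hypothetical call): data_to_needs([("Title", "The pump shall start.")],
# prefix="R_X_") returns a dict whose versions["1.0"]["needs"]["R_X_0"] entry is
# ready to be dumped to needs.json.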

15
src/nrsk/utils/tokenize.py Normal file

@@ -0,0 +1,15 @@
"""
Break input text file into sentences.
"""
from nltk.tokenize.punkt import PunktSentenceTokenizer
def tokenize_sentences(text: str) -> list:
tokenizer = PunktSentenceTokenizer()
tokenizer.train(text)
sentences = tokenizer.tokenize(text)
return sentences