Initial commit

Nick Touran 2025-09-22 14:54:12 -04:00
commit 56a79e3562
55 changed files with 1905 additions and 0 deletions

0
src/nrsk/__init__.py Normal file

168
src/nrsk/regs/load_10cfr50.py Executable file

@@ -0,0 +1,168 @@
"""
Read text 10 CFR 50 App B and turn to json and xlsx.
Text should be loaded from web and pasted into data folder.
Needs to eventually write requirements data to flow down to QA program.
One way to do that is to write a ``needs.json`` file to the format
described `here <https://sphinx-needs.readthedocs.io/en/latest/builders.html#format>`_.
"""
import json
import logging
import re
import requests
from bs4 import BeautifulSoup
from nrsk.utils import excel, get_project_root, needs, tokenize
logging.basicConfig(level=logging.INFO)
ROOT = get_project_root()
LOG = logging.getLogger(__name__)
TXT_10_CFR_50_APPA = ROOT / "data" / "regs" / "10-cfr-50-app-a.txt"
TXT_10_CFR_50_APPB = ROOT / "data" / "regs" / "10-cfr-50-app-b.txt"
OUT_PATH = ROOT / "documents" / "generated_assets"
# todo: move to config file in case they change?
URL_APP_A = (
"https://www.nrc.gov/reading-rm/doc-collections/cfr/part050/part050-appa.html"
)
URL_APP_B = (
"https://www.nrc.gov/reading-rm/doc-collections/cfr/part050/part050-appb.html"
)


def fetch_10cfr50_app_a():
    """Go online and fetch the text of 10 CFR 50 App A.

    We still commit the results to the repo because this system
    may be run in an environment that does not have internet access.
    This function shows how the committed files were created in the first place.

    Note that this may require changes if the NRC updates the format of the
    webpage.
    """

def _is_criteria_label(tag):
return tag.name == "p" and tag.string == "Criteria"
    # timeout keeps the script from hanging if nrc.gov is unreachable
    html_app_a = requests.get(URL_APP_A, timeout=30).text
soup = BeautifulSoup(html_app_a, "html.parser")
# criteria are all in the last general-content div
content = soup.find_all("div", class_="general-content")[-1]
label = content.find_next(_is_criteria_label)
with TXT_10_CFR_50_APPA.open("w") as f:
for crit in label.find_all_next("p"):
if "1 Further details relating" in crit.text:
# marks the end
break
f.write(crit.text + "\n\n")


def process_10cfr50_app_a():
    """
    Convert App A text (General Design Criteria) into Sphinx Needs.

    Each criterion has a title followed by a number of "shall" sentences,
    so each numbered criterion should produce one main need plus several
    child needs. The input text file should start in the Criteria section,
    e.g. at Overall Requirements.

    Since this makes parent/child requirements, we'll try the sphinx-needs
    list2need feature for this one:
    https://sphinx-needs.readthedocs.io/en/latest/directives/list2need.html

    list2need mangles labels on (1) and (2) listings in the text, so we remap
    those to 1., 2., etc. It also chokes on non-numerical parentheticals
    (likely a sphinx-needs bug), which we map to comma phrases, and even then
    some output is still mangled.

    .. todo:: submit a bug report to sphinx-needs with these examples.
    """
    def process(line):
        # remap "(1)" -> "1." so list2need keeps the label intact
        line = re.sub(r"\((\d+)\)", r"\1.", line)
        # remap remaining parentheticals to comma phrases; [^)] keeps the
        # match from spanning multiple parentheticals on one line
        line = re.sub(r"\(([^)]+)\)", r", \1,", line)
        return line
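    # e.g. (hypothetical input):
    #   process("Criterion (1) applies (see note 2).")
    #   -> "Criterion 1. applies , see note 2,."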
needs_lines = [".. list2need::", " :types: req, req, req", ""]
with open(TXT_10_CFR_50_APPA) as inp_f:
for line in inp_f:
line = line.strip()
if not line:
continue
            if re.search(r"^[A-Z]+\. [A-Z]", line):
                # section headings, e.g. "I. Overall Requirements"
                LOG.info("Reading %s", line)
elif match := re.search(r"^Criterion (\d+)—(.+?)\.(.+)", line):
num = match.group(1).strip()
title = match.group(2).strip()
subnum = 1
                LOG.info("Found Criterion %s - %s", num, title)
needs_lines.append(f" * (R_GDC_{num}){process(title)}")
sentences = tokenize.tokenize_sentences(match.group(3).strip())
for sent in sentences:
needs_lines.append(
f" * (R_GDC_{num}_{subnum}){process(sent.strip())}"
)
subnum += 1
            else:
                # continuation of the current criterion; num and subnum carry over
                sentences = tokenize.tokenize_sentences(line)
for sent in sentences:
needs_lines.append(
f" * (R_GDC_{num}_{subnum}){process(sent.strip())}"
)
subnum += 1
with open(OUT_PATH / "10-cfr-50-app-a-list.rst", "w") as out_f:
out_f.write("\n".join(needs_lines))
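# The generated RST looks roughly like this (hypothetical excerpt; in the real
# file the child bullets are indented under the parent so list2need nests them):
#
#   .. list2need::
#      :types: req, req, req
#
#      * (R_GDC_1)Quality standards and records
#        * (R_GDC_1_1)Structures, systems, and components important to safety
#          shall be designed ... to quality standards commensurate with ...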


def process_10cfr50_app_b():
    """
    Read 10 CFR 50 App B and generate needs data and an xlsx workbook from it.

    Just tokenizing the text straight up works well enough here.
    """
with open(TXT_10_CFR_50_APPB) as inp_f:
sentences = tokenize.tokenize_sentences(inp_f.read())
needs_data = []
i = 0
    for s in sentences:
        if len(s) < 40:
            # length heuristic to skip headings and other fragments
            LOG.warning("Skipping short sentence: `%s`", s)
            continue
needs_data.append((f"10 CFR 50 App B Sentence {i}", s))
i += 1
with (OUT_PATH / "10-cfr-50-app-b.json").open("w") as f:
json.dump(
needs.data_to_needs(needs_data, prefix="R_APPB_", links=["R_10CFR50_APPB"]),
f,
indent=4,
)
filtered_titles, filtered_sentences = zip(*needs_data)
wb = excel.make_xlsx_from_sentences(filtered_sentences)
wb.save(OUT_PATH / "10-cfr-50-app-b.xlsx") # can only write to disk, not stream


if __name__ == "__main__":
    # fetch_10cfr50_app_a()  # uncomment to re-fetch the source text from nrc.gov
    OUT_PATH.mkdir(parents=True, exist_ok=True)
    process_10cfr50_app_a()
    process_10cfr50_app_b()

5
src/nrsk/utils/__init__.py Normal file

@@ -0,0 +1,5 @@
"""General Utils."""
from pathlib import Path
def get_project_root() -> Path:
return Path(__file__).parent.parent.parent.parent

12
src/nrsk/utils/excel.py Normal file

@@ -0,0 +1,12 @@
"""Excel utils."""
from openpyxl import Workbook
def make_xlsx_from_sentences(sentences: list) -> Workbook:
"""Convert json list of sentences to xlsx with numbers."""
wb = Workbook()
ws = wb.active
for row_data in enumerate(sentences):
ws.append(row_data)
return wb
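# Example (hypothetical): make_xlsx_from_sentences(["First.", "Second."]) yields
# rows (0, "First.") and (1, "Second.") in the active worksheet.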

53
src/nrsk/utils/forms.py Normal file

@@ -0,0 +1,53 @@
"""
Utilities for working with Forms.
"""
import os
import subprocess
import yaml
PDFTK = "/usr/bin/pdftk"


def load_form_data(file_path):
    """Load yaml or json form data intended to be populated into a form template."""
    with open(file_path, "r") as f:
        if file_path.endswith(".yaml"):
            return yaml.safe_load(f)
        elif file_path.endswith(".json"):
            return json.load(f)
    raise ValueError(f"Unsupported form data format: {file_path}")


def create_fdf(data, fdf_file):
    """Create form data in FDF format to fill into a PDF form template."""
    fdf = "%FDF-1.2\n1 0 obj\n<< /FDF << /Fields ["
    for key, value in data.items():
        value = str(value).replace("\n", " ").replace("\r", "")
        # backslash and parentheses are special in FDF literal strings
        value = value.replace("\\", r"\\").replace("(", r"\(").replace(")", r"\)")
        fdf += f"<< /V ({value}) /T ({key}) >>"
    fdf += "] >> >>\nendobj\ntrailer\n<< /Root 1 0 R >>\n%%EOF"
    with open(fdf_file, "w") as f:
        f.write(fdf)
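# For data={"Name": "DIF3D"}, the generated FDF looks like:
#   %FDF-1.2
#   1 0 obj
#   << /FDF << /Fields [<< /V (DIF3D) /T (Name) >>] >> >>
#   endobj
#   trailer
#   << /Root 1 0 R >>
#   %%EOF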


def fill_pdf(template_pdf, fdf_file, output_pdf):
    """Populate a given pdf form template with form data."""
    temp_fname = output_pdf + ".filled"
    subprocess.run(
        [PDFTK, template_pdf, "fill_form", fdf_file, "output", temp_fname],
        check=True,
    )
    # and flatten it so the filled fields are no longer editable
    subprocess.run([PDFTK, temp_fname, "output", output_pdf, "flatten"], check=True)
    os.remove(temp_fname)
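# Shell equivalent of the two pdftk calls above (hypothetical file names):
#   pdftk template.pdf fill_form data.fdf output tmp.pdf
#   pdftk tmp.pdf output filled.pdf flatten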


def demo():
    # Example usage.
    # For checkboxes, use "Yes" and "Off".
data = {"Name": "DIF3D", "ExportControlled": "Yes", "Type": "Engineering"}
create_fdf(data, "data.fdf")
fill_pdf(
"software-dedication-form.pdf",
"data.fdf",
"software-dedication-form-filled.pdf",
)

37
src/nrsk/utils/needs.py Normal file

@@ -0,0 +1,37 @@
"""Utils for building/managing sphinx-needs data."""
import datetime
def data_to_needs(
needs_data: list[str, str],
prefix="",
type="req",
type_name="Requirement",
links=None,
) -> dict:
"""
Convert list of (titles, texts) to valid needs data.
Format from: https://sphinx-needs.readthedocs.io/en/latest/builders.html#format
"""
now = datetime.datetime.now().isoformat()
need_data = {}
    for i, (title, body) in enumerate(needs_data):
        need_id = f"{prefix}{i}"
        need_data[need_id] = {
            "description": body,
            "id": need_id,
            "title": title,
            "type": type,
            "type_name": type_name,
            "links": links or [],
            "tags": [],
        }
data = {
"created": now,
"current_version": "1.0",
"project": "nuclear reactor starter kit",
"versions": {"1.0": {"created": now, "needs": need_data}},
}
return data
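# Example (hypothetical call): data_to_needs([("Title", "The pump shall start.")],
# prefix="R_X_") returns a dict whose versions["1.0"]["needs"]["R_X_0"] entry is
# ready to be dumped to needs.json.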

15
src/nrsk/utils/tokenize.py Normal file

@@ -0,0 +1,15 @@
"""
Break input text file into sentences.
"""
from nltk.tokenize.punkt import PunktSentenceTokenizer
def tokenize_sentences(text: str) -> list:
tokenizer = PunktSentenceTokenizer()
tokenizer.train(text)
sentences = tokenizer.tokenize(text)
return sentences