elainezhu committed on
Commit
d662891
·
verified ·
1 Parent(s): ca323dd

Update form/data/schema.py

Browse files
Files changed (1) hide show
  1. form/data/schema.py +560 -63
form/data/schema.py CHANGED
@@ -1,75 +1,572 @@
1
- from datetime import datetime
2
-
3
def generate_machine_readable_output(form_data):
    """Generate machine-readable JSON-LD output for a submitted flaw report.

    Args:
        form_data: Mapping of form field labels (for example ``"Report ID"``)
            to the submitted values. Missing labels yield ``None`` (or an
            empty list for the multi-select fields).

    Returns:
        A dict containing the fixed ``basicInformation``, ``commonFields`` and
        ``disclosurePlan`` sections, plus one extra section for every
        recognised entry in ``"Report Types"``.
    """
    get = form_data.get

    # Look the selection up once. An explicit ``None`` value used to make the
    # ``in`` checks below raise TypeError; treat it as "nothing selected".
    selected_types = get("Report Types") or []

    json_ld = {
        "@context": "https://schema.org",
        "@type": "AIFlawReport",
        "reportId": get("Report ID"),
        # NOTE(review): this is a naive local timestamp (no UTC offset).
        "dateCreated": datetime.now().isoformat(),
        "reportStatus": get("Report Status"),
        "reportTypes": get("Report Types", []),
        "basicInformation": {
            "reporterId": get("Reporter ID"),
            "sessionId": get("Session ID"),
            "flawTimestampStart": get("Flaw Timestamp Start"),
            "flawTimestampEnd": get("Flaw Timestamp End"),
            "systems": get("Systems", []),
        },
        "commonFields": {
            "contextInfo": get("Context Info"),
            "flawDescription": get("Flaw Description"),
            "policyViolation": get("Policy Violation"),
            "severity": get("Severity"),
            "prevalence": get("Prevalence"),
            "impacts": get("Impacts", []),
            "impactedStakeholders": get("Impacted Stakeholder(s)", []),
            "riskSource": get("Risk Source", []),
            "bountyEligibility": get("Bounty Eligibility"),
        },
        "disclosurePlan": {
            "disclosureIntent": get("Disclosure Intent"),
            "disclosureTimeline": get("Disclosure Timeline"),
            "disclosureChannels": get("Disclosure Channels", []),
            "embargoRequest": get("Embargo Request"),
        },
    }

    # Report-type specific sections
    if "Real-World Events" in selected_types:
        json_ld["realWorldEvent"] = {
            "incidentDescription": get("Description of the Incident(s)"),
            "implicatedSystems": get("Implicated Systems"),
            "submitterRelationship": get("Submitter Relationship"),
            "eventDates": get("Event Date(s)"),
            "eventLocations": get("Event Location(s)"),
            "experiencedHarmTypes": get("Experienced Harm Types", []),
            "experiencedHarmSeverity": get("Experienced Harm Severity"),
            "harmNarrative": get("Harm Narrative"),
        }

    if "Malign Actor" in selected_types:
        json_ld["malignActor"] = {
            "tactics": get("Tactic Select", []),
            "impact": get("Impact", []),
        }

    if "Security Incident Report" in selected_types:
        json_ld["securityIncident"] = {
            "threatActorIntent": get("Threat Actor Intent"),
            "detection": get("Detection", []),
        }

    if "Vulnerability Report" in selected_types:
        json_ld["vulnerability"] = {
            "proofOfConcept": get("Proof-of-Concept Exploit"),
        }

    if "Hazard Report" in selected_types:
        json_ld["hazard"] = {
            "examples": get("Examples"),
            "replicationPacket": get("Replication Packet"),
            "statisticalArgument": get("Statistical Argument"),
        }

    return json_ld
 
 
 
 
 
 
 
1
import hashlib
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Any, List, Optional, Union

import pyld
from pydantic import BaseModel, ConfigDict, Field, field_validator, validator
from pyld import jsonld
9
+
10
class UnknownAISystem(BaseModel):
    """An AI system known only through a free-text description from the user."""

    # Required, non-empty text.
    description: str = Field(
        ...,
        min_length=1,
        description="User description of unknown AI system",
    )
13
+
14
class RawAIFlawReport(BaseModel):
    """Raw form input - matches the actual complete form structure.

    Every field is populated from its form label (the ``alias``) or, because
    ``populate_by_name`` is enabled, from the Python field name. Unknown form
    keys are kept on the model (``extra="allow"``) rather than rejected.
    Only ``disclosure_intent`` is required.
    """

    # Pydantic v2 configuration; replaces the deprecated inner ``class Config``.
    model_config = ConfigDict(extra="allow", populate_by_name=True)

    # Identification
    report_id: Optional[str] = Field(None, alias="Report ID")
    reporter_id: Optional[str] = Field(None, alias="Reporter ID")
    session_id: Optional[str] = Field(None, alias="Session ID")
    systems: Optional[List[str]] = Field(default_factory=list, alias="Systems")
    flaw_timestamp_start: Optional[str] = Field(None, alias="Flaw Timestamp Start")
    submission_timestamp: Optional[str] = Field(None, alias="Submission Timestamp")

    report_types: List[str] = Field(default_factory=list, alias="Report Types")

    # Descriptions
    flaw_description: Optional[str] = Field(None, alias="Flaw Description")
    incident_description: Optional[str] = Field(None, alias="Incident Description")
    incident_description_detailed: Optional[str] = Field(None, alias="Incident Description - Detailed")
    flaw_description_detailed: Optional[str] = Field(None, alias="Flaw Description - Detailed")

    # Assessment
    policy_violation: Optional[str] = Field(None, alias="Policy Violation")
    potential_policy_violation: Optional[str] = Field(None, alias="Potential Policy Violation")
    severity: Optional[str] = Field(None, alias="Severity")
    prevalence: Optional[str] = Field(None, alias="Prevalence")

    # Impact
    impacts: Optional[List[str]] = Field(default_factory=list, alias="Impacts")
    impacts_other: Optional[str] = Field(None, alias="Impacts_Other")
    specific_harm_types: Optional[List[str]] = Field(default_factory=list, alias="Specific Harm Types")
    impacted_stakeholders: Optional[List[str]] = Field(default_factory=list, alias="Impacted Stakeholder(s)")
    csam_related: Optional[str] = Field(None, alias="CSAM Related")

    risk_sources: Optional[Dict[str, Any]] = Field(None, alias="Risk Source(s)")

    context_info: Optional[str] = Field(None, alias="Context Info")
    proof_of_concept: Optional[str] = Field(None, alias="Proof-of-Concept Exploit")

    # Real-world incident details
    submitter_relationship: Optional[str] = Field(None, alias="Submitter Relationship")
    submitter_relationship_other: Optional[str] = Field(None, alias="Submitter_Relationship_Other")
    incident_locations: Optional[str] = Field(None, alias="Incident Location(s)")
    harm_narrative: Optional[str] = Field(None, alias="Harm Narrative")

    # Security details
    attacker_resources: Optional[List[str]] = Field(default_factory=list, alias="Attacker Resources")
    attacker_objectives: Optional[List[str]] = Field(default_factory=list, alias="Attacker Objectives")
    objective_context: Optional[str] = Field(None, alias="Objective Context")
    detection_methods: Optional[List[str]] = Field(default_factory=list, alias="Detection")

    statistical_argument: Optional[str] = Field(None, alias="Statistical Argument with Examples")

    # Disclosure plan
    disclosure_intent: str = Field(..., alias="Disclosure Intent")
    disclosure_timeline: Optional[str] = Field(None, alias="Disclosure Timeline")
    disclosure_channels: Optional[List[str]] = Field(default_factory=list, alias="Disclosure Channels")
    disclosure_channels_other: Optional[str] = Field(None, alias="Disclosure_Channels_Other")
    embargo_request: Optional[str] = Field(None, alias="Embargo Request")

    @field_validator('disclosure_intent')
    @classmethod
    def validate_disclosure_intent(cls, v):
        """Reject any disclosure intent outside the fixed set of form options."""
        valid_intents = {'Yes', 'No', 'Undecided', 'Already Public Knowledge'}
        if v not in valid_intents:
            # Sorted so the message is the same on every run (set order is not).
            raise ValueError(f'Disclosure intent must be one of: {sorted(valid_intents)}')
        return v

    @field_validator('severity')
    @classmethod
    def validate_severity(cls, v):
        """Accept ``None`` or one of the fixed severity labels."""
        if v is None:
            return v
        valid_severities = {
            'Negligible', 'Low', 'Medium', 'High', 'Critical', 'Significant'
        }
        if v not in valid_severities:
            # Sorted so the message is the same on every run (set order is not).
            raise ValueError(f'Severity must be one of: {sorted(valid_severities)}')
        return v
86
+
87
class AISystem(BaseModel):
    """Enriched AI System information.

    ``system_type`` is "known" for systems found in the knowledge base,
    "partially_known" for systems named by the reporter but absent from the
    knowledge base, and "unknown" when no system was identified at all.
    """

    id: str = Field(..., description="System identifier/URL")
    name: str = Field(..., description="System name")
    version: str = Field(..., description="System version")
    slug: str = Field(..., description="Internal slug for lookups")
    display_name: str = Field(..., description="Human-friendly display name")
    # The description now lists every value process_raw_report assigns.
    system_type: str = Field(default="known", description="'known', 'partially_known' or 'unknown'")
    description: Optional[str] = Field(None, description="For unknown systems")
    publisher_info: Optional[Dict[str, Any]] = Field(None, description="Publisher/organization data")
98
+
99
class ProcessedAIFlawReport(BaseModel):
    """A flaw report after processing: resolved systems plus normalized fields."""

    # --- identity ---------------------------------------------------------
    report_id: str = Field(..., description="Generated report ID")
    # Timezone-aware UTC timestamp taken when the model is instantiated.
    created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))

    ai_systems: List[AISystem] = Field(..., description="Fully enriched system data")

    # --- core flaw data ---------------------------------------------------
    reporter_id: Optional[str] = None
    session_id: Optional[str] = None
    flaw_timestamp_start: Optional[str] = None
    flaw_description: str
    policy_violation: str

    # --- assessment -------------------------------------------------------
    severity: str
    prevalence: str
    impacts: List[str] = []
    specific_harm_types: List[str] = []
    impacted_stakeholders: List[str] = []

    # --- classification ---------------------------------------------------
    report_types: List[str] = []

    # --- sections present only for matching report types -------------------
    incident_data: Optional[Dict[str, Any]] = None
    security_data: Optional[Dict[str, Any]] = None
    vulnerability_data: Optional[Dict[str, Any]] = None
    hazard_data: Optional[Dict[str, Any]] = None

    # --- disclosure ---------------------------------------------------------
    disclosure_intent: str
    disclosure_timeline: Optional[str] = None
    disclosure_channels: List[str] = []

    # Original form payload; excluded from repr().
    raw_data: Optional[Dict[str, Any]] = Field(default=None, repr=False)
135
+
136
+
137
def clean_internal_fields(data: Any) -> Any:
    """Remove fields starting with underscore for clean JSON-LD output.

    Walks dicts and lists recursively. Dict entries whose key is a string
    beginning with ``"_"`` are dropped at every depth; everything else is
    returned unchanged. The input is never mutated.

    Args:
        data: A dict, a list, or any other JSON-like value.

    Returns:
        A new dict or list with internal fields removed, or ``data`` itself
        when it is neither a dict nor a list.
    """
    if isinstance(data, dict):
        return {
            key: clean_internal_fields(value)
            for key, value in data.items()
            # Non-string keys cannot be "internal"; keep them rather than
            # crashing on a missing ``startswith``.
            if not (isinstance(key, str) and key.startswith("_"))
        }
    if isinstance(data, list):
        return [clean_internal_fields(item) for item in data]
    return data
145
+
146
class AIFlawKnowledgeBase:
    """Knowledge base for AI systems and organizations.

    Reads ``ai-systems.jsonld`` and ``organizations.jsonld`` from ``kb_path``.
    Each file is expected to hold a JSON object with an ``"@graph"`` list of
    nodes; a node may carry an ``"_aifr_internal"`` dict with ``slug`` and
    ``displayName``. A missing or unparsable file is replaced by an empty
    graph, independently of the other file.
    """

    def __init__(self, kb_path: str = "knowledge-base"):
        self.kb_path = Path(kb_path)
        self.systems_data = None
        self.organizations_data = None
        # Combined slug -> node index (systems and organizations).
        self.slug_map = {}
        # Systems only, so a system lookup can never return an organization.
        self._system_slug_map = {}
        self._load_knowledge_base()

    @staticmethod
    def _graph_nodes(data: Any) -> List[Dict[str, Any]]:
        """Return the dict nodes of ``data["@graph"]``, or [] if malformed."""
        if not isinstance(data, dict):
            return []
        graph = data.get("@graph") or []
        if not isinstance(graph, list):
            return []
        return [node for node in graph if isinstance(node, dict)]

    @staticmethod
    def _internal(node: Dict[str, Any]) -> Dict[str, Any]:
        """Return the node's ``_aifr_internal`` dict, or {} if absent/invalid."""
        internal = node.get("_aifr_internal")
        return internal if isinstance(internal, dict) else {}

    def _load_graph_file(self, filename: str) -> Dict[str, Any]:
        """Load one knowledge-base file, falling back to an empty graph."""
        path = self.kb_path / filename
        if not path.is_file():
            return {"@graph": []}
        try:
            # JSON is UTF-8 by specification; do not rely on the locale default.
            with open(path, encoding="utf-8") as f:
                loaded = json.load(f)
        except (FileNotFoundError, json.JSONDecodeError):
            return {"@graph": []}
        return loaded if isinstance(loaded, dict) else {"@graph": []}

    def _load_knowledge_base(self):
        """Load knowledge base files or create minimal fallback data."""
        # Loaded independently, so one corrupt file no longer discards the
        # data that was read successfully from the other.
        self.systems_data = self._load_graph_file("ai-systems.jsonld")
        self.organizations_data = self._load_graph_file("organizations.jsonld")
        self._build_slug_map()

    def _build_slug_map(self):
        """Build slug to system/org mapping."""
        self.slug_map = {}
        self._system_slug_map = {}

        for system in self._graph_nodes(self.systems_data):
            slug = self._internal(system).get("slug")
            if slug:
                self.slug_map[slug] = system
                self._system_slug_map[slug] = system

        for org in self._graph_nodes(self.organizations_data):
            slug = self._internal(org).get("slug")
            if slug:
                self.slug_map[slug] = org

    def find_system_by_name_or_slug(self, identifier: str) -> Optional[Dict[str, Any]]:
        """Find system by name or slug with fuzzy matching.

        An exact slug match wins. Otherwise the first system is returned whose
        ``name`` or ``displayName`` contains the identifier, or is contained
        in it, ignoring case.

        Args:
            identifier: Slug or (partial) name supplied by the caller.

        Returns:
            The matching system node, or ``None`` when nothing matches or the
            identifier is blank.
        """
        if not isinstance(identifier, str) or not identifier.strip():
            # A blank needle is a substring of every name; do not match on it.
            return None

        # Direct slug match (systems only)
        system = self._system_slug_map.get(identifier)
        if system is not None:
            return system

        # Search by name (case-insensitive partial match)
        needle = identifier.strip().lower()
        for system in self._graph_nodes(self.systems_data):
            labels = (system.get("name"), self._internal(system).get("displayName"))
            for label in labels:
                # A missing or empty label would be "contained" in every
                # identifier and turn this system into a catch-all match.
                if not isinstance(label, str) or not label.strip():
                    continue
                label_lower = label.strip().lower()
                if needle in label_lower or label_lower in needle:
                    return system

        return None

    def find_organization_by_id(self, org_id: str) -> Optional[Dict[str, Any]]:
        """Find organization by @id URI."""
        for org in self._graph_nodes(self.organizations_data):
            if org.get("@id") == org_id:
                return org
        return None

    def get_system_jsonld(self, identifier: str) -> Optional[Dict[str, Any]]:
        """Get clean JSON-LD representation of system with full publisher data.

        Internal (underscore-prefixed) fields are stripped. When the system's
        ``publisher`` references an organization in the knowledge base by
        ``@id``, the reference is replaced with that organization's data.

        Returns:
            The cleaned system node, or ``None`` if no system matches.
        """
        system = self.find_system_by_name_or_slug(identifier)
        if not system:
            return None

        jsonld_system = clean_internal_fields(system)

        if "@type" not in jsonld_system:
            jsonld_system["@type"] = "schema:SoftwareApplication"

        publisher = system.get("publisher")
        # Only a dict reference carries an @id that can be resolved.
        publisher_id = publisher.get("@id") if isinstance(publisher, dict) else None
        if publisher_id:
            org_data = self.find_organization_by_id(publisher_id)
            if org_data:
                publisher_jsonld = clean_internal_fields(org_data)
                if "@type" not in publisher_jsonld:
                    publisher_jsonld["@type"] = "schema:Organization"
                jsonld_system["publisher"] = publisher_jsonld

        return jsonld_system
242
+
243
+
244
def process_raw_report(raw_data: Dict[str, Any]) -> ProcessedAIFlawReport:
    """Convert raw form data to a processed report by resolving AI systems.

    Validates the form payload, resolves each listed system against the
    knowledge base, picks a description and policy-violation text, and builds
    the sections that apply to the selected report types.

    Args:
        raw_data: Form payload keyed by form label (or by field name).

    Returns:
        The processed report, with ``raw_data`` attached unchanged.

    Raises:
        pydantic.ValidationError: If ``raw_data`` fails ``RawAIFlawReport``
            validation (for example a missing "Disclosure Intent").
    """
    raw_report = RawAIFlawReport.model_validate(raw_data)
    kb = AIFlawKnowledgeBase()

    # Read the ID from the validated model so it is honoured whether the
    # payload used the "Report ID" alias or the ``report_id`` field name.
    report_id = raw_report.report_id
    if not report_id:
        # Deterministic fallback derived from the payload contents.
        digest = hashlib.md5(json.dumps(raw_data, sort_keys=True).encode()).hexdigest()
        report_id = f"AFL-{digest[:8]}"

    ai_systems = _resolve_report_systems(raw_report.systems or [], kb, report_id)
    description = _pick_report_description(raw_report)
    policy_violation = raw_report.policy_violation or raw_report.potential_policy_violation or "Not specified"
    report_types = raw_report.report_types

    incident_data = None
    if "Real-World Incidents" in report_types:
        incident_data = {
            "description": raw_report.incident_description,
            "detailed_description": raw_report.incident_description_detailed,
            "locations": raw_report.incident_locations,
            "harm_narrative": raw_report.harm_narrative,
            "submitter_relationship": raw_report.submitter_relationship,
            "submitter_relationship_other": raw_report.submitter_relationship_other,
        }

    security_data = None
    if any(rt in report_types for rt in ["Malign Actor", "Security Incident Report"]):
        security_data = {
            "attacker_resources": raw_report.attacker_resources or [],
            "attacker_objectives": raw_report.attacker_objectives or [],
            "objective_context": raw_report.objective_context,
            "detection_methods": raw_report.detection_methods or [],
        }

    vulnerability_data = None
    if "Vulnerability Report" in report_types:
        vulnerability_data = {
            "proof_of_concept": raw_report.proof_of_concept,
        }

    hazard_data = None
    if "Hazard Report" in report_types:
        hazard_data = {
            "statistical_argument": raw_report.statistical_argument,
        }

    processed_report = ProcessedAIFlawReport(
        report_id=report_id,
        ai_systems=ai_systems,
        reporter_id=raw_report.reporter_id,
        session_id=raw_report.session_id,
        flaw_timestamp_start=raw_report.flaw_timestamp_start,
        flaw_description=description,
        policy_violation=policy_violation,
        severity=raw_report.severity or "Unknown",
        prevalence=raw_report.prevalence or "Unknown",
        impacts=raw_report.impacts or [],
        specific_harm_types=raw_report.specific_harm_types or [],
        impacted_stakeholders=raw_report.impacted_stakeholders or [],
        report_types=report_types,
        incident_data=incident_data,
        security_data=security_data,
        vulnerability_data=vulnerability_data,
        hazard_data=hazard_data,
        disclosure_intent=raw_report.disclosure_intent,
        disclosure_timeline=raw_report.disclosure_timeline,
        disclosure_channels=raw_report.disclosure_channels or [],
    )

    # Attached after construction so the caller's dict is stored as-is.
    processed_report.raw_data = raw_data
    return processed_report


def _fallback_system_uri(system_name: str) -> str:
    """Build the aiflawreports.org URI used when a system has no ``@id``."""
    return f"https://aiflawreports.org/systems/{system_name.replace(' ', '_')}"


def _resolve_report_systems(system_names, kb, report_id):
    """Turn reported system names into AISystem records via the knowledge base."""
    resolved = []
    for system_name in system_names:
        if not system_name or not system_name.strip():
            # A blank entry names nothing; skip it instead of emitting an
            # AISystem with an empty name and a dangling URI.
            continue

        system_data = kb.find_system_by_name_or_slug(system_name)
        if system_data:
            internal_data = system_data.get("_aifr_internal", {})
            resolved.append(AISystem(
                # Same space-free fallback URI as the branch below.
                id=system_data.get("@id", _fallback_system_uri(system_name)),
                name=system_data.get("name", system_name),
                version=system_data.get("version", ""),
                slug=internal_data.get("slug", system_name),
                display_name=internal_data.get("displayName", system_name),
                system_type="known",
            ))
        else:
            # Named by the reporter but absent from the knowledge base.
            resolved.append(AISystem(
                id=_fallback_system_uri(system_name),
                name=system_name,
                version="",
                slug=system_name,
                display_name=system_name,
                system_type="partially_known",
            ))

    if not resolved:
        # Always emit at least one system so the report stays well-formed.
        resolved.append(AISystem(
            id=f"https://aiflawreports.org/reports/{report_id}/unknown-system",
            name="Unknown System",
            version="",
            slug="",
            display_name="Unknown System",
            system_type="unknown",
            description="No specific system identified",
        ))
    return resolved


def _pick_report_description(raw_report) -> str:
    """Choose the report description: incident text first, then flaw text."""
    if raw_report.incident_description:
        return raw_report.incident_description
    if raw_report.incident_description_detailed:
        return f"**Detailed Description:**\n{raw_report.incident_description_detailed}"
    if raw_report.flaw_description:
        return raw_report.flaw_description
    if raw_report.flaw_description_detailed:
        return f"**Detailed Description:**\n{raw_report.flaw_description_detailed}"
    return "No description provided"
358
+
359
+ def _normalized_description(text: Optional[str]) -> str:
360
+ if not text:
361
+ return "No description provided"
362
+ prefix = "**Detailed Description:**"
363
+ t = text.lstrip()
364
+ if t.startswith(prefix):
365
+ t = t[len(prefix):].lstrip()
366
+ return t
367
+
368
def serialize_to_jsonld(processed_report: ProcessedAIFlawReport) -> Dict[str, Any]:
    """Render a processed flaw report as a JSON-LD document (a plain dict).

    Systems of type "unknown" become a minimal placeholder node. Every other
    system is looked up in the knowledge base, falling back to a node built
    from the report's own fields. Optional parts are emitted only when the
    report carries a value for them; risk source, context info, submission
    timestamp and embargo request are taken from the attached raw form data.
    """
    report = processed_report
    knowledge_base = AIFlawKnowledgeBase()

    # --- AI systems -------------------------------------------------------
    system_nodes = []
    system_labels = []
    for system in report.ai_systems:
        if system.system_type == "unknown":
            system_nodes.append({
                "@type": "schema:SoftwareApplication",
                "@id": system.id,
                "name": "Unknown System",
                "description": system.description,
            })
            system_labels.append("Unknown System")
            continue

        enriched = knowledge_base.get_system_jsonld(system.slug or system.name)
        if not enriched:
            # Not in the knowledge base: describe it from the report itself.
            enriched = {
                "@type": "schema:SoftwareApplication",
                "@id": system.id,
                "name": system.name,
                "version": system.version,
            }
        system_nodes.append(enriched)
        system_labels.append(system.display_name)

    # --- core document ------------------------------------------------------
    context = [
        "https://schema.org/",
        {
            "aifr": "https://aiflawreports.org/schema/",
            "aiSystem": "aifr:aiSystem",
            "severity": "aifr:severity",
            "prevalence": "aifr:prevalence",
            "impacts": "aifr:impacts",
            "reportType": "aifr:reportType",
            "riskSource": "aifr:riskSource",
            "contextInfo": "aifr:contextInfo",
        },
    ]
    document = {
        "@context": context,
        "@type": "aifr:AIFlawReport",
        "@id": f"https://aiflawreports.org/reports/{report.report_id}",
        "name": f"AI Flaw Report: {', '.join(system_labels)}",
        "description": _normalized_description(report.flaw_description),
        "aiSystem": system_nodes,
        "severity": report.severity,
        "prevalence": report.prevalence,
        "impacts": report.impacts,
        "reportType": report.report_types,
        "dateCreated": report.created_at.isoformat(),
        "identifier": report.report_id,
        "aifr:policyViolation": report.policy_violation,
    }

    # --- optional scalar/list properties -----------------------------------
    if report.reporter_id:
        document["author"] = {"@type": "schema:Person", "identifier": report.reporter_id}
    optional_properties = (
        ("aifr:sessionId", report.session_id),
        ("aifr:flawTimestamp", report.flaw_timestamp_start),
        ("aifr:impactedStakeholders", report.impacted_stakeholders),
        ("aifr:specificHarmTypes", report.specific_harm_types),
    )
    for key, value in optional_properties:
        if value:
            document[key] = value

    # --- report-type sections ------------------------------------------------
    incident = report.incident_data
    if incident:
        incident_node = {
            "@type": "aifr:RealWorldIncident",
            "description": incident.get("description"),
            "location": incident.get("locations"),
            "aifr:harmNarrative": incident.get("harm_narrative"),
        }
        relationship = incident.get("submitter_relationship")
        if relationship:
            incident_node["aifr:submitterRelationship"] = relationship
        document["aifr:incident"] = incident_node

    security = report.security_data
    if security:
        security_node = {
            "@type": "aifr:SecurityIncident",
            "aifr:attackerResources": security.get("attacker_resources", []),
            "aifr:attackerObjectives": security.get("attacker_objectives", []),
            "aifr:detectionMethods": security.get("detection_methods", []),
        }
        objective_context = security.get("objective_context")
        if objective_context:
            security_node["aifr:objectiveContext"] = objective_context
        document["aifr:securityAspect"] = security_node

    vulnerability = report.vulnerability_data
    if vulnerability:
        document["aifr:vulnerability"] = {
            "@type": "aifr:Vulnerability",
            "aifr:proofOfConcept": vulnerability.get("proof_of_concept"),
        }

    hazard = report.hazard_data
    if hazard:
        document["aifr:hazard"] = {
            "@type": "aifr:Hazard",
            "aifr:statisticalArgument": hazard.get("statistical_argument"),
        }

    # --- disclosure plan -------------------------------------------------------
    disclosure_node = {"@type": "aifr:DisclosurePlan", "aifr:intent": report.disclosure_intent}
    if report.disclosure_timeline:
        disclosure_node["aifr:timeline"] = report.disclosure_timeline
    if report.disclosure_channels:
        disclosure_node["aifr:channels"] = report.disclosure_channels
    document["aifr:disclosure"] = disclosure_node

    # --- values carried only by the raw form payload ---------------------------
    raw = getattr(report, "raw_data", None)
    if not raw:
        return document

    document["aifr:raw"] = raw

    # Risk Source(s) -> aifr:riskSource
    risk_source = raw.get("Risk Source(s)")
    if risk_source:
        document["aifr:riskSource"] = {
            "@type": "aifr:RiskSourceAnalysis",
            "aifr:responsibleFactors": risk_source.get("Responsible Factors", []),
            "aifr:responsibleFactorsSubcategories": risk_source.get("Responsible Factors Subcategories", {}),
            "aifr:responsibleFactorsContext": risk_source.get("Responsible Factors Context", ""),
        }

    # Context Info -> aifr:contextInfo (only None and "" count as absent)
    context_info = raw.get("Context Info")
    if context_info not in (None, ""):
        document["aifr:contextInfo"] = context_info

    # Submission Timestamp -> aifr:submissionTimestamp
    submitted_at = raw.get("Submission Timestamp")
    if submitted_at:
        document["aifr:submissionTimestamp"] = submitted_at

    # Embargo Request is attached to the disclosure plan built above.
    embargo = raw.get("Embargo Request")
    if embargo:
        disclosure_node["aifr:embargoRequest"] = embargo

    return document
497
+
498
+
499
+
500
def generate_machine_readable_output(form_data: Dict[str, Any]) -> str:
    """Generate the JSON-LD document for a submitted form as a JSON string.

    Main function that replaces the original manual JSON-LD generation. The
    form is processed with ``process_raw_report``, serialized with
    ``serialize_to_jsonld`` and compacted against its own ``@context``.

    This function is best-effort and does not raise for processing problems:
    if compaction fails, the uncompacted document is returned; if anything
    else fails, a minimal fallback document is returned whose
    ``aifr:processingError`` holds the error text.

    Args:
        form_data: Form payload keyed by form label.

    Returns:
        A JSON string, indented by two spaces.
    """
    # Function-scope import: keeps this block self-contained.
    import logging

    log = logging.getLogger(__name__)

    try:
        processed_report = process_raw_report(form_data)

        jsonld_report = serialize_to_jsonld(processed_report)

        try:
            compacted = jsonld.compact(jsonld_report, jsonld_report["@context"])
            return json.dumps(compacted, indent=2)
        except Exception:
            # Still best-effort, but no longer silent.
            log.warning("JSON-LD compaction failed; returning uncompacted report", exc_info=True)
            return json.dumps(jsonld_report, indent=2)

    except Exception as e:
        log.warning("Flaw report processing failed; returning fallback document", exc_info=True)
        # ``or`` rather than a ``.get`` default: a key present with value None
        # used to produce ".../reports/None" and a null description.
        report_id = form_data.get("Report ID") or "unknown"
        return json.dumps({
            "@context": "https://schema.org/",
            "@type": "Report",
            "@id": f"https://aiflawreports.org/reports/{report_id}",
            "name": "AI Flaw Report",
            "description": form_data.get("Flaw Description") or "",
            "dateCreated": datetime.now(timezone.utc).isoformat(),
            "aifr:processingError": str(e)
        }, indent=2)
526
+
527
def validate_jsonld_output(jsonld_string: str) -> tuple[bool, Optional[str]]:
    """Validate the generated JSON-LD.

    Checks that the string parses as a JSON object, that ``@context``,
    ``@type`` and ``@id`` are present, and that the document can be expanded
    by the JSON-LD processor. This function does not raise; every failure is
    reported through the returned message.

    Args:
        jsonld_string: Serialized JSON-LD document.

    Returns:
        ``(is_valid, error_message)``; ``error_message`` is ``None`` when the
        document is valid.
    """
    try:
        data = json.loads(jsonld_string)
    except json.JSONDecodeError as e:
        return False, f"Invalid JSON: {e}"
    except Exception as e:
        # For example a non-string argument.
        return False, f"Validation error: {e}"

    if not isinstance(data, dict):
        # Arrays and scalars used to fall through to a misleading
        # "Missing @context" or an accidental TypeError message.
        return False, "JSON-LD document must be a JSON object"

    for keyword in ("@context", "@type", "@id"):
        if keyword not in data:
            return False, f"Missing {keyword}"

    try:
        # Only success or failure matters; the expanded form is not needed.
        jsonld.expand(data)
    except Exception as e:
        return False, f"JSON-LD expansion error: {e}"

    return True, None
552
+
553
+
554
def update_storage_interface_for_jsonld(form_data: Dict[str, Any]):
    """Build a storage record holding the form data and its JSON-LD rendering.

    Generates the JSON-LD string for ``form_data``, validates it, and prints a
    warning if validation fails. The record is returned either way, so storage
    providers can persist both formats.

    Returns:
        A dict with the original form data, the parsed JSON-LD, the JSON-LD
        string, and the validation status and error.
    """
    serialized = generate_machine_readable_output(form_data)
    valid, problem = validate_jsonld_output(serialized)

    if not valid:
        print(f"JSON-LD validation warning: {problem}")

    record = {"form_data": form_data}
    record["jsonld"] = json.loads(serialized)
    record["jsonld_string"] = serialized
    record["validation_status"] = valid
    record["validation_error"] = problem
    return record