-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdelta_diff.py
More file actions
93 lines (78 loc) · 3.42 KB
/
Copy pathdelta_diff.py
File metadata and controls
93 lines (78 loc) · 3.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env python3
"""Position-level diff between two FundsXML files (INITIAL vs DELTA / day-over-day).
Usage: delta_diff.py <old.xml> <new.xml> [--json]
FundsXML's ControlData/DataOperation says INITIAL (full snapshot) or DELTA
(changes only). Independent of that flag, this compares the two position sets
by UniqueID and reports:
    added      present in new, not in old
    removed    present in old, not in new
    changed    value or percentage differs (> 0.005 tolerance)
    unchanged  count only
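Both files are streamed with iterparse and each Position element is released
as soon as it has been indexed, so parsed positions do not pile up in memory;
only the per-position index (UniqueID -> value, percentage) is kept.
Exit code: 0 if the sets are identical, 1 if there are differences, 2 on
usage error.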
FundsXML 4.x has no XML namespace — bare element names.
Dependencies: lxml (`pip install lxml`); Python stdlib otherwise.
Security: iterparse runs with resolve_entities=False, no_network=True
(XXE / external-entity safe on untrusted feeds). The 0.005 tolerance absorbs
2-dp rounding so re-exported numbers don't show as spurious "changed".
"""
import json
import sys
from lxml import etree
def _to_float(text):
    """Parse element text as a float; ``None`` or blank text counts as 0.0."""
    text = (text or "").strip()
    return float(text) if text else 0.0


def index(path):
    """Stream one FundsXML file and index its positions by UniqueID.

    Args:
        path: Source handed to ``etree.iterparse`` (normally a file path).

    Returns:
        A ``(op, positions)`` tuple. ``op`` is the stripped text of the last
        ``DataOperation`` element encountered, or ``None`` if the file has
        none. ``positions`` maps each non-empty, whitespace-stripped UniqueID
        to ``(total_value, total_percentage)`` as floats, read from the first
        ``TotalValue/Amount`` and ``TotalPercentage`` children of the
        ``Position``. Missing or blank numeric fields count as 0.0. If a
        UniqueID occurs more than once, the last occurrence wins.

    Raises:
        ValueError: If a numeric field contains non-numeric text.
        Errors raised by lxml while opening or parsing the source are not
        caught here and propagate to the caller.
    """
    op = None
    out = {}
    # resolve_entities=False / no_network=True: never expand entities or fetch
    # external resources while parsing.
    ctx = etree.iterparse(path, events=("end",), resolve_entities=False,
                          no_network=True)
    for _, el in ctx:
        if el.tag == "DataOperation":
            op = (el.text or "").strip()
        elif el.tag == "Position":
            # Strip so that pretty-printed / padded ids compare equal and a
            # whitespace-only id is treated as missing.
            uid = (el.findtext("UniqueID") or "").strip()
            val = _to_float(el.findtext("./TotalValue/Amount"))
            pct = _to_float(el.findtext("TotalPercentage"))
            if uid:
                out[uid] = (val, pct)
            # Release the subtree we just read, then drop the already
            # processed siblings in front of it so they do not accumulate
            # under the parent while streaming.
            el.clear()
            parent = el.getparent()
            while parent is not None and el.getprevious() is not None:
                del parent[0]
    return op, out
def main() -> int:
    """Command-line entry point: diff two FundsXML files by position UniqueID.

    Reads ``sys.argv``: exactly two positional paths (old, new) plus an
    optional ``--json`` flag. Prints either a JSON document or a short text
    summary to stdout; the text summary lists at most the first 10 added,
    removed and changed ids.

    Returns:
        0 if both position sets match within the tolerance, 1 if anything was
        added, removed or changed, 2 on a usage error or when an input file
        cannot be read or parsed.
    """
    argv = sys.argv[1:]
    as_json = "--json" in argv
    args = [a for a in argv if a != "--json"]
    if len(args) != 2:
        print("usage: delta_diff.py <old.xml> <new.xml> [--json]",
              file=sys.stderr)
        return 2

    try:
        op_old, a = index(args[0])
        op_new, b = index(args[1])
    except (OSError, ValueError, etree.LxmlError) as exc:
        # Left uncaught, this would end the process with Python's default
        # status 1, which callers could not tell apart from "differences
        # found". Report it as an error instead.
        print(f"delta_diff.py: {exc}", file=sys.stderr)
        return 2

    # Absolute tolerance applied to both value and percentage; absorbs
    # 2-decimal rounding noise between exports.
    tolerance = 0.005
    common = a.keys() & b.keys()
    added = sorted(b.keys() - a.keys())
    removed = sorted(a.keys() - b.keys())
    changed = sorted(
        u for u in common
        if abs(a[u][0] - b[u][0]) > tolerance
        or abs(a[u][1] - b[u][1]) > tolerance
    )
    unchanged = len(common) - len(changed)

    if as_json:
        print(json.dumps({
            "old": {"file": args[0], "dataOperation": op_old, "positions": len(a)},
            "new": {"file": args[1], "dataOperation": op_new, "positions": len(b)},
            "added": added, "removed": removed,
            # (value, percentage) tuples serialise as two-element JSON arrays.
            "changed": [{"id": u, "old": a[u], "new": b[u]} for u in changed],
            "unchanged": unchanged,
        }, indent=2))
    else:
        print(f"old: {args[0]} DataOperation={op_old} positions={len(a)}")
        print(f"new: {args[1]} DataOperation={op_new} positions={len(b)}")
        print(f"added : {len(added)} {added[:10]}{' ...' if len(added) > 10 else ''}")
        print(f"removed : {len(removed)} {removed[:10]}{' ...' if len(removed) > 10 else ''}")
        print(f"changed : {len(changed)}")
        for u in changed[:10]:
            print(f" {u}: value {a[u][0]:.2f} -> {b[u][0]:.2f}, "
                  f"pct {a[u][1]:.4f} -> {b[u][1]:.4f}")
        if len(changed) > 10:
            print(" ...")
        print(f"unchanged: {unchanged}")
    return 0 if not (added or removed or changed) else 1
# Script entry point: hand main()'s return value to the interpreter as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())