
Commit 378c05b

Move the core of the combine logic to be entirely in SQL
This shaves roughly 40% off the runtime of pyca/cryptography's combine step. A major limitation is that it currently doesn't work with in-memory SQLite databases; I'm not sure there's a good way to do that.
1 parent d8f88c7 · commit 378c05b
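The change leans on two SQLite features worth sketching before reading the diff: a connection can register Python callables as SQL functions and aggregates (sqlite3's create_function / create_aggregate), and it can ATTACH a second database file so the merge becomes a few INSERT ... SELECT statements instead of Python loops over fetched rows. The sketch below is a hypothetical, simplified stand-in, not coverage.py's schema: the line_bits table layout, the BytesUnionAgg aggregate, the bits_union SQL name, and the main.db/other.db file names are all illustrative. It also shows where the in-memory limitation comes from: ATTACH wants a path the connection can open, which an ordinary ":memory:" database doesn't have.

    import sqlite3

    class BytesUnionAgg:
        """Hypothetical aggregate: byte-wise OR of blobs (a stand-in for numbits_union_agg)."""
        def __init__(self) -> None:
            self.result = b""

        def step(self, value: bytes) -> None:
            if value:
                # Pad both blobs to the same length, then OR them byte by byte.
                width = max(len(self.result), len(value))
                a = self.result.ljust(width, b"\x00")
                b = value.ljust(width, b"\x00")
                self.result = bytes(x | y for x, y in zip(a, b))

        def finalize(self) -> bytes:
            return self.result

    def make_db(path, rows):
        # Build a toy database with a single table keyed by file path.
        con = sqlite3.connect(path)
        con.execute("CREATE TABLE IF NOT EXISTS line_bits (path TEXT PRIMARY KEY, numbits BLOB)")
        con.executemany("INSERT OR REPLACE INTO line_bits VALUES (?, ?)", rows)
        con.commit()
        con.close()

    make_db("main.db", [("a.py", b"\x01")])
    make_db("other.db", [("a.py", b"\x02"), ("b.py", b"\x04")])

    con = sqlite3.connect("main.db")
    con.create_aggregate("bits_union", 1, BytesUnionAgg)
    # ATTACH needs a file path; an ordinary ":memory:" database has none that
    # another connection can open, which is the limitation noted in the commit message.
    con.execute("ATTACH DATABASE ? AS other_db", ("other.db",))
    # Merge in SQL instead of looping in Python. (The real patch additionally
    # unions with any row already present in main; this sketch simply replaces it.)
    con.execute(
        "INSERT OR REPLACE INTO main.line_bits (path, numbits) "
        "SELECT path, bits_union(numbits) FROM other_db.line_bits GROUP BY path"
    )
    con.commit()

In the diff itself, the same pattern appears as create_function("numbits_union", ...), create_function("map_path", ...), and create_aggregate("numbits_union_agg", 1, NumbitsUnionAgg) on the target connection, followed by ATTACH DATABASE on other_data.data_filename(); the real queries additionally union with any numbits already stored for the same file and context.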

File tree: 1 file changed (+118 / -122 lines)


coverage/sqldata.py

Lines changed: 118 additions & 122 deletions
@@ -122,6 +122,22 @@ def _wrapped(self: CoverageData, *args: Any, **kwargs: Any) -> Any:
     return _wrapped
 
 
+class NumbitsUnionAgg:
+    """SQLite aggregate function for computing union of numbits."""
+
+    def __init__(self) -> None:
+        self.result = b""
+
+    def step(self, value: bytes) -> None:
+        """Process one value in the aggregation."""
+        if value:
+            self.result = numbits_union(self.result, value)
+
+    def finalize(self) -> bytes:
+        """Return the final aggregated result."""
+        return self.result
+
+
 class CoverageData:
     """Manages collected coverage data, including file storage.
 
@@ -676,146 +692,126 @@ def update(
 
         # Force the database we're writing to to exist before we start nesting contexts.
         self._start_using()
-
-        # Collector for all arcs, lines and tracers
         other_data.read()
-        with other_data._connect() as con:
-            # Get files data.
-            with con.execute("select path from file") as cur:
-                files = {path: map_path(path) for (path,) in cur}
-
-            # Get contexts data.
-            with con.execute("select context from context") as cur:
-                contexts = cur.fetchall()
-
-            # Get arc data.
-            with con.execute(
-                "select file.path, context.context, arc.fromno, arc.tono " +
-                "from arc " +
-                "inner join file on file.id = arc.file_id " +
-                "inner join context on context.id = arc.context_id",
-            ) as cur:
-                arcs = [
-                    (files[path], context, fromno, tono)
-                    for (path, context, fromno, tono) in cur
-                ]
-
-            # Get line data.
-            with con.execute(
-                "select file.path, context.context, line_bits.numbits " +
-                "from line_bits " +
-                "inner join file on file.id = line_bits.file_id " +
-                "inner join context on context.id = line_bits.context_id",
-            ) as cur:
-                lines: dict[tuple[str, str], bytes] = {}
-                for path, context, numbits in cur:
-                    key = (files[path], context)
-                    if key in lines:
-                        numbits = numbits_union(lines[key], numbits)
-                    lines[key] = numbits
-
-            # Get tracer data.
-            with con.execute(
-                "select file.path, tracer " +
-                "from tracer " +
-                "inner join file on file.id = tracer.file_id",
-            ) as cur:
-                tracers = {files[path]: tracer for (path, tracer) in cur}
+
+        # Ensure other_data has a properly initialized database
+        with other_data._connect():
+            pass
 
         with self._connect() as con:
             assert con.con is not None
             con.con.isolation_level = "IMMEDIATE"
 
-            # Get all tracers in the DB. Files not in the tracers are assumed
-            # to have an empty string tracer. Since Sqlite does not support
-            # full outer joins, we have to make two queries to fill the
-            # dictionary.
-            with con.execute("select path from file") as cur:
-                this_tracers = {path: "" for path, in cur}
-            with con.execute(
-                "select file.path, tracer from tracer " +
-                "inner join file on file.id = tracer.file_id",
-            ) as cur:
-                this_tracers.update({
-                    map_path(path): tracer
-                    for path, tracer in cur
-                })
-
-            # Create all file and context rows in the DB.
-            con.executemany_void(
-                "insert or ignore into file (path) values (?)",
-                [(file,) for file in files.values()],
-            )
-            with con.execute("select id, path from file") as cur:
-                file_ids = {path: id for id, path in cur}
-            self._file_map.update(file_ids)
-            con.executemany_void(
-                "insert or ignore into context (context) values (?)",
-                contexts,
-            )
-            with con.execute("select id, context from context") as cur:
-                context_ids = {context: id for id, context in cur}
-
-            # Prepare tracers and fail, if a conflict is found.
-            # tracer_paths is used to ensure consistency over the tracer data
-            # and tracer_map tracks the tracers to be inserted.
-            tracer_map = {}
-            for path in files.values():
-                this_tracer = this_tracers.get(path)
-                other_tracer = tracers.get(path, "")
-                # If there is no tracer, there is always the None tracer.
-                if this_tracer is not None and this_tracer != other_tracer:
+            # Register functions for SQLite
+            con.con.create_function("numbits_union", 2, numbits_union)
+            con.con.create_function("map_path", 1, map_path)
+            con.con.create_aggregate("numbits_union_agg", 1, NumbitsUnionAgg)
+
+            # Attach the other database
+            con.execute_void("ATTACH DATABASE ? AS other_db", (other_data.data_filename(),))
+
+            # Check for tracer conflicts before proceeding
+            with con.execute("""
+                SELECT map_path(main.file.path),
+                    COALESCE(main.tracer.tracer, ''),
+                    COALESCE(other_db.tracer.tracer, '')
+                FROM main.file
+                LEFT JOIN main.tracer ON main.file.id = main.tracer.file_id
+                INNER JOIN other_db.file ON map_path(main.file.path) = map_path(other_db.file.path)
+                LEFT JOIN other_db.tracer ON other_db.file.id = other_db.tracer.file_id
+                WHERE COALESCE(main.tracer.tracer, '') != COALESCE(other_db.tracer.tracer, '')
+            """) as cur:
+                conflicts = list(cur)
+                if conflicts:
+                    path, this_tracer, other_tracer = conflicts[0]
                     raise DataError(
                         "Conflicting file tracer name for '{}': {!r} vs {!r}".format(
                             path, this_tracer, other_tracer,
                         ),
                     )
-                tracer_map[path] = other_tracer
 
-            # Prepare arc and line rows to be inserted by converting the file
-            # and context strings with integer ids. Then use the efficient
-            # `executemany()` to insert all rows at once.
+            # Insert missing files from other_db (with map_path applied)
+            con.execute_void("""
+                INSERT OR IGNORE INTO main.file (path)
+                SELECT map_path(path) FROM other_db.file
+            """)
 
-            if arcs:
-                self._choose_lines_or_arcs(arcs=True)
+            # Insert missing contexts from other_db
+            con.execute_void("""
+                INSERT OR IGNORE INTO main.context (context)
+                SELECT context FROM other_db.context
+            """)
 
-                arc_rows = [
-                    (file_ids[file], context_ids[context], fromno, tono)
-                    for file, context, fromno, tono in arcs
-                ]
+            # Update file_map with any new files
+            with con.execute("select id, path from file") as cur:
+                self._file_map.update({path: id for id, path in cur})
 
-                # Write the combined data.
-                con.executemany_void(
-                    "insert or ignore into arc " +
-                    "(file_id, context_id, fromno, tono) values (?, ?, ?, ?)",
-                    arc_rows,
-                )
+            with con.execute("""
+                SELECT
+                    EXISTS(SELECT 1 FROM other_db.arc),
+                    EXISTS(SELECT 1 FROM other_db.line_bits)
+            """) as cur:
+                has_arcs, has_lines = cur.fetchone()
 
-            if lines:
+            # Handle arcs if present in other_db
+            if has_arcs:
+                self._choose_lines_or_arcs(arcs=True)
+                con.execute_void("""
+                    INSERT OR IGNORE INTO main.arc (file_id, context_id, fromno, tono)
+                    SELECT
+                        main_file.id,
+                        main_context.id,
+                        other_arc.fromno,
+                        other_arc.tono
+                    FROM other_db.arc AS other_arc
+                    INNER JOIN other_db.file AS other_file ON other_arc.file_id = other_file.id
+                    INNER JOIN other_db.context AS other_context ON other_arc.context_id = other_context.id
+                    INNER JOIN main.file AS main_file ON map_path(other_file.path) = main_file.path
+                    INNER JOIN main.context AS main_context ON other_context.context = main_context.context
+                """)
+
+            # Handle line_bits if present in other_db
+            if has_lines:
                 self._choose_lines_or_arcs(lines=True)
 
-                for (file, context), numbits in lines.items():
-                    with con.execute(
-                        "select numbits from line_bits where file_id = ? and context_id = ?",
-                        (file_ids[file], context_ids[context]),
-                    ) as cur:
-                        existing = list(cur)
-                        if existing:
-                            lines[(file, context)] = numbits_union(numbits, existing[0][0])
-
-                con.executemany_void(
-                    "insert or replace into line_bits " +
-                    "(file_id, context_id, numbits) values (?, ?, ?)",
-                    [
-                        (file_ids[file], context_ids[context], numbits)
-                        for (file, context), numbits in lines.items()
-                    ],
-                )
-
-            con.executemany_void(
-                "insert or ignore into tracer (file_id, tracer) values (?, ?)",
-                [(file_ids[filename], tracer) for filename, tracer in tracer_map.items()],
-            )
+                # Handle line_bits by aggregating other_db data by mapped target,
+                # then inserting/updating
+                con.execute_void("""
+                    INSERT OR REPLACE INTO main.line_bits (file_id, context_id, numbits)
+                    SELECT
+                        main_file.id,
+                        main_context.id,
+                        numbits_union(
+                            COALESCE((
+                                SELECT numbits FROM main.line_bits
+                                WHERE file_id = main_file.id AND context_id = main_context.id
+                            ), X''),
+                            aggregated.combined_numbits
+                        )
+                    FROM (
+                        SELECT
+                            map_path(other_file.path) as mapped_path,
+                            other_context.context,
+                            numbits_union_agg(other_line_bits.numbits) as combined_numbits
+                        FROM other_db.line_bits AS other_line_bits
+                        INNER JOIN other_db.file AS other_file ON other_line_bits.file_id = other_file.id
+                        INNER JOIN other_db.context AS other_context ON other_line_bits.context_id = other_context.id
+                        GROUP BY map_path(other_file.path), other_context.context
+                    ) AS aggregated
+                    INNER JOIN main.file AS main_file ON aggregated.mapped_path = main_file.path
+                    INNER JOIN main.context AS main_context ON aggregated.context = main_context.context
+                """)
+
+            # Insert tracers from other_db (avoiding conflicts we already checked)
+            con.execute_void("""
+                INSERT OR IGNORE INTO main.tracer (file_id, tracer)
+                SELECT
+                    main_file.id,
+                    other_tracer.tracer
+                FROM other_db.tracer AS other_tracer
+                INNER JOIN other_db.file AS other_file ON other_tracer.file_id = other_file.id
+                INNER JOIN main.file AS main_file ON map_path(other_file.path) = main_file.path
+            """)
 
         if not self._no_disk:
             # Update all internal cache data.