policymap/dedupe_relationships.py

56 lines
1.9 KiB
Python
Raw Normal View History

2025-03-14 07:17:25 -07:00
import os
from dotenv import load_dotenv
from neo4j import GraphDatabase
# Read settings from a local .env file (via python-dotenv) before looking them up
load_dotenv()

# Neo4j connection settings taken from the process environment;
# each value is None when the corresponding variable is not set
NEO4J_URI = os.environ.get('NEO4J_URI')
NEO4J_USER = os.environ.get('NEO4J_USER')
NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD')
# Function to connect to the Neo4j database and remove duplicate relationships while keeping one of each type
def remove_duplicate_relationships(uri, user, password):
    """Delete duplicate relationships, keeping one per duplicate group.

    Relationships are treated as duplicates when they share the same start
    node, the same end node and the same relationship type (the match is
    directed, so ``(a)-[:T]->(b)`` and ``(b)-[:T]->(a)`` are different
    groups). For each group the first relationship ID returned by the
    database is kept and every other one is deleted. Progress is printed
    to stdout.

    Args:
        uri: Connection URI handed to ``GraphDatabase.driver``.
        user: Username used for authentication.
        password: Password used for authentication.

    Returns:
        None.
    """
    # Only the ID lists are returned: the nodes and the type are needed for
    # grouping on the server but are never used on the client side.
    identify_query = """
    MATCH (n)-[r]->(m)
    WITH type(r) AS relType, n AS startNode, m AS endNode, collect(id(r)) AS relIds
    WHERE size(relIds) > 1
    RETURN relIds
    ORDER BY size(relIds) DESC
    """
    # One statement removes every surplus relationship of a group.
    delete_query = "MATCH ()-[r]->() WHERE id(r) IN $relIds DELETE r"

    driver = GraphDatabase.driver(uri, auth=(user, password))
    try:
        with driver.session() as session:
            # Fetch all duplicate groups up front so the result is not still
            # being streamed while delete statements run on the same session.
            duplicate_groups = [
                record['relIds'] for record in session.run(identify_query)
            ]

            for rel_ids in duplicate_groups:
                # The query already filters on size > 1; this guard only
                # protects the unpacking below.
                if len(rel_ids) < 2:
                    continue

                # Keep the first relationship, delete the rest of the group.
                keep_rel_id, *delete_rel_ids = rel_ids

                # consume() waits for the server to finish the delete, so any
                # failure is raised here, before success is reported.
                session.run(delete_query, relIds=delete_rel_ids).consume()

                for del_rel_id in delete_rel_ids:
                    print(f"Deleted relationship with ID: {del_rel_id}")
                print("Remaining Relationship ID:", keep_rel_id)
                print("\n")
    finally:
        # Release the driver's connections even when a query fails.
        driver.close()
# Run the clean-up using the connection settings loaded from the environment
remove_duplicate_relationships(
    uri=NEO4J_URI,
    user=NEO4J_USER,
    password=NEO4J_PASSWORD,
)