add bill sponsors and dedupe script

This commit is contained in:
Moses Rolston 2025-03-14 07:17:25 -07:00
parent c1e456aedc
commit b012392fc9
2 changed files with 110 additions and 0 deletions

55
add_bill_sponsors.py Normal file
View File

@ -0,0 +1,55 @@
import os
from dotenv import load_dotenv
from neo4j import GraphDatabase
# Read the .env file so its entries become visible as environment variables.
load_dotenv()
# Neo4j connection settings; each one is None when its variable is not set.
NEO4J_URI = os.environ.get("NEO4J_URI")
NEO4J_USER = os.environ.get("NEO4J_USER")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD")
def get_sponsors_bioguide_id(uri, user, password):
    """Link Bill nodes to their sponsor with a SPONSORED relationship.

    Reads the ``sponsors.0.bioguideId`` property of the Bill nodes, finds the
    Person node(s) whose ``bioguideId`` equals it, prints the properties of
    each matched Person, and ensures a ``(Person)-[:SPONSORED]->(Bill)``
    relationship exists for every bill naming that sponsor.

    Re-running is safe: only sponsors that still have an unlinked bill are
    reported, and relationships are written with ``MERGE`` so a Person/Bill
    pair never ends up with more than one SPONSORED relationship from this
    function.

    Args:
        uri: Neo4j connection URI.
        user: Neo4j user name.
        password: Password for ``user``.

    Returns:
        None. Progress is printed to stdout, once per sponsor that needed
        at least one new relationship.
    """
    # One row per (sponsor id, Person) that still has at least one bill
    # without the relationship. The pattern in the WHERE clause uses the same
    # direction as the MERGE below; checking the opposite direction would
    # never find the relationships this function writes.
    find_sponsors_query = """
    MATCH (b:Bill)
    WHERE b.`sponsors.0.bioguideId` IS NOT NULL
    MATCH (p:Person {bioguideId: b.`sponsors.0.bioguideId`})
    WHERE NOT (p)-[:SPONSORED]->(b)
    RETURN DISTINCT b.`sponsors.0.bioguideId` AS sponsorBioguideId, p
    """
    # Covers every bill of the sponsor in one statement, so it must run once
    # per sponsor (not once per bill). MERGE leaves existing links untouched.
    link_sponsor_query = """
    MATCH (p:Person {bioguideId: $bioguideId})
    MATCH (b:Bill)
    WHERE b.`sponsors.0.bioguideId` = $bioguideId
    MERGE (p)-[:SPONSORED]->(b)
    """
    driver = GraphDatabase.driver(uri, auth=(user, password))
    try:
        with driver.session() as session:
            # Fetch all rows before issuing writes on the same session.
            sponsor_rows = list(session.run(find_sponsors_query))
            for record in sponsor_rows:
                sponsor_bioguide_id = record['sponsorBioguideId']
                matched_person = record['p']
                print("Matched Person Node:")
                for key, value in matched_person.items():
                    print(f"{key}: {value}")
                # consume() waits for the write to finish so that any server
                # error is raised here rather than when the session closes.
                session.run(
                    link_sponsor_query,
                    bioguideId=sponsor_bioguide_id,
                ).consume()
                print("Created SPONSORED relationship.")
    finally:
        # Release the connection pool even if a query fails.
        driver.close()
# Script entry: run the sponsor linking with the connection settings read above.
get_sponsors_bioguide_id(
    uri=NEO4J_URI,
    user=NEO4J_USER,
    password=NEO4J_PASSWORD,
)

55
dedupe_relationships.py Normal file
View File

@ -0,0 +1,55 @@
import os
from dotenv import load_dotenv
from neo4j import GraphDatabase
# Read the .env file so its entries become visible as environment variables.
load_dotenv()
# Neo4j connection settings; each one is None when its variable is not set.
NEO4J_URI = os.environ.get("NEO4J_URI")
NEO4J_USER = os.environ.get("NEO4J_USER")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD")
def remove_duplicate_relationships(uri, user, password):
    """Delete duplicate relationships, keeping one per (start, type, end).

    Relationships are grouped by start node, end node and relationship type.
    For every group holding more than one relationship, the first collected
    id is kept and all others are deleted. Relationship properties are not
    compared, so relationships of the same type between the same two nodes
    are treated as duplicates even when their properties differ.

    Args:
        uri: Neo4j connection URI.
        user: Neo4j user name.
        password: Password for ``user``.

    Returns:
        None. Every deleted id and the surviving id of each group are
        printed to stdout.
    """
    # Only the ids are returned; the nodes and the type serve as grouping
    # keys and are not needed on the client side.
    # NOTE(review): id() is reportedly deprecated in newer Neo4j releases in
    # favour of elementId() - confirm against the server version in use.
    identify_query = """
    MATCH (n)-[r]->(m)
    WITH type(r) AS relType, n, m, collect(id(r)) AS relIds
    WHERE size(relIds) > 1
    RETURN relIds
    ORDER BY size(relIds) DESC
    """
    delete_query = "MATCH ()-[r]->() WHERE id(r) IN $relIds DELETE r"
    driver = GraphDatabase.driver(uri, auth=(user, password))
    try:
        with driver.session() as session:
            # Fetch all groups before issuing deletes on the same session.
            duplicate_groups = [
                record['relIds'] for record in session.run(identify_query)
            ]
            for rel_ids in duplicate_groups:
                keep_rel_id, *delete_rel_ids = rel_ids
                # One round trip per group; consume() surfaces server errors
                # before the deletions are reported as done.
                session.run(delete_query, relIds=delete_rel_ids).consume()
                for del_rel_id in delete_rel_ids:
                    print(f"Deleted relationship with ID: {del_rel_id}")
                print("Remaining Relationship ID:", keep_rel_id)
                print("\n")
    finally:
        # Release the connection pool even if a query fails.
        driver.close()
# Script entry: run the de-duplication with the connection settings read above.
remove_duplicate_relationships(
    uri=NEO4J_URI,
    user=NEO4J_USER,
    password=NEO4J_PASSWORD,
)