add bill sponsors and dedupe script
This commit is contained in:
parent
c1e456aedc
commit
b012392fc9
55
add_bill_sponsors.py
Normal file
55
add_bill_sponsors.py
Normal file
@ -0,0 +1,55 @@
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from neo4j import GraphDatabase
|
||||
|
||||
# Pull the variables defined in the .env file into the process environment
load_dotenv()

# Neo4j connection settings, looked up in the environment
NEO4J_URI = os.environ.get("NEO4J_URI")
NEO4J_USER = os.environ.get("NEO4J_USER")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD")
|
||||
|
||||
def get_sponsors_bioguide_id(uri, user, password):
    """Link every Bill to its sponsoring Person with a SPONSORED relationship.

    The ``sponsors.0.bioguideId`` property of each ``Bill`` node is compared
    with the ``bioguideId`` property of ``Person`` nodes. For every sponsor
    that still has at least one unlinked bill, the properties of the matched
    Person node are printed and a ``(Person)-[:SPONSORED]->(Bill)``
    relationship is written for each of that sponsor's bills.

    Args:
        uri: Neo4j connection URI.
        user: Neo4j user name.
        password: Neo4j password.
    """
    # The "already linked" check must use the same direction as the
    # relationship written below (Person -> Bill); checking Bill -> Person
    # never matches and makes every run re-create all relationships.
    # DISTINCT collapses the per-bill rows so each sponsor is handled once.
    find_unlinked_query = """
    MATCH (b:Bill)
    WITH b, b.`sponsors.0.bioguideId` AS bioguideId
    MATCH (p:Person {bioguideId: bioguideId})
    WHERE NOT (p)-[:SPONSORED]->(b)
    RETURN DISTINCT bioguideId AS sponsorBioguideId, p
    """

    # This statement covers *all* bills of the sponsor in one go. MERGE keeps
    # it idempotent, so bills that are already linked do not get a second
    # relationship.
    create_relationship_query = """
    MATCH (p:Person {bioguideId: $bioguideId})
    MATCH (b:Bill)
    WHERE b.`sponsors.0.bioguideId` = $bioguideId
    MERGE (p)-[:SPONSORED]->(b)
    """

    driver = GraphDatabase.driver(uri, auth=(user, password))
    try:
        with driver.session() as session:
            # Read everything first so no write is issued while the read
            # result is still being streamed on the same session.
            records = list(session.run(find_unlinked_query))

            for record in records:
                sponsor_bioguide_id = record['sponsorBioguideId']
                matched_person = record['p']

                print("Matched Person Node:")
                for key, value in matched_person.items():
                    print(f"{key}: {value}")

                # consume() waits for the write to finish, so a failure is
                # raised here rather than after the success message.
                session.run(
                    create_relationship_query,
                    bioguideId=sponsor_bioguide_id,
                ).consume()
                print("Created SPONSORED relationship.")
    finally:
        # Release the connection pool even when a query raises.
        driver.close()
|
||||
|
||||
# Run the sponsor-linking pass with the connection settings from the environment
get_sponsors_bioguide_id(uri=NEO4J_URI, user=NEO4J_USER, password=NEO4J_PASSWORD)
|
55
dedupe_relationships.py
Normal file
55
dedupe_relationships.py
Normal file
@ -0,0 +1,55 @@
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from neo4j import GraphDatabase
|
||||
|
||||
# Pull the variables defined in the .env file into the process environment
load_dotenv()

# Neo4j connection settings, looked up in the environment
NEO4J_URI = os.environ.get("NEO4J_URI")
NEO4J_USER = os.environ.get("NEO4J_USER")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD")
|
||||
|
||||
def remove_duplicate_relationships(uri, user, password):
    """Delete duplicate relationships, keeping one per start/type/end triple.

    Relationships are grouped by start node, end node and relationship type.
    In every group with more than one member, the first collected
    relationship is kept and the others are deleted. Relationship properties
    are not compared. The ids of the deleted and the kept relationships are
    printed for each group.

    Args:
        uri: Neo4j connection URI.
        user: Neo4j user name.
        password: Neo4j password.
    """
    # Only the id lists are returned; the nodes and the type serve purely as
    # grouping keys, so shipping them to the client is unnecessary.
    identify_query = """
    MATCH (n)-[r]->(m)
    WITH type(r) AS relType, n AS startNode, m AS endNode, collect(id(r)) AS relIds
    WHERE size(relIds) > 1
    RETURN relIds
    ORDER BY size(relIds) DESC
    """
    # One statement per group instead of one round trip per relationship.
    delete_query = "MATCH ()-[r]->() WHERE id(r) IN $relIds DELETE r"

    driver = GraphDatabase.driver(uri, auth=(user, password))
    try:
        with driver.session() as session:
            # Read every group first so no delete is issued while the read
            # result is still being streamed on the same session.
            duplicate_groups = [
                record['relIds'] for record in session.run(identify_query)
            ]

            for rel_ids in duplicate_groups:
                # The query guarantees at least two ids per group.
                keep_rel_id, *delete_rel_ids = rel_ids

                # consume() waits for the delete to finish, so a failure is
                # raised here rather than after the messages below.
                session.run(delete_query, relIds=delete_rel_ids).consume()

                for del_rel_id in delete_rel_ids:
                    print(f"Deleted relationship with ID: {del_rel_id}")

                print("Remaining Relationship ID:", keep_rel_id)
                print("\n")
    finally:
        # Release the connection pool even when a query raises.
        driver.close()
|
||||
|
||||
# Run the de-duplication pass with the connection settings from the environment
remove_duplicate_relationships(uri=NEO4J_URI, user=NEO4J_USER, password=NEO4J_PASSWORD)
|
Loading…
Reference in New Issue
Block a user