I am trying to create few tables in Postgres from pandas dataframe but I am kept getting this error.
psycopg2.errors.InvalidForeignKey: there is no unique constraint matching given keys for referenced table "titles"
After looking into this problem for hours, i finally found that when I am inserting the data into parent table from pandas dataframe, the primary key constraint gets removed for some reasons and due to that I am getting this error when trying to refernece it from another table.
But I am not having this problem when I am using pgAdmin4 to create the table and inserting few rows of data manually.
you can see when I created the tables using pgAdmin, the primary key and foreign keys are getting created as expected and I have no problem with it.
But when I try to insert the data from pandas dataframe using psycopg2 library, the primary key is not getting created.
I Can’t able to understand why is this happening.
The code I am using to create the tables –
# function for faster data insertion
def psql_insert_copy(table, conn, keys, data_iter):
"""
Execute SQL statement inserting data
Parameters
----------
table : pandas.io.sql.SQLTable
conn : sqlalchemy.engine.Engine or sqlalchemy.engine.Connection
keys : list of str
Column names
data_iter : Iterable that iterates the values to be inserted
"""
# gets a DBAPI connection that can provide a cursor
dbapi_conn = conn.connection
with dbapi_conn.cursor() as cur:
s_buf = StringIO()
writer = csv.writer(s_buf)
writer.writerows(data_iter)
s_buf.seek(0)
columns = ", ".join('"{}"'.format(k) for k in keys)
if table.schema:
table_name = "{}.{}".format(table.schema, table.name)
else:
table_name = table.name
sql = "COPY {} ({}) FROM STDIN WITH CSV".format(table_name, columns)
cur.copy_expert(sql=sql, file=s_buf)
def create_titles_table():
# connect to the database
conn = psycopg2.connect(
dbname="imdb",
user="postgres",
password=os.environ.get("DB_PASSWORD"),
host="localhost",
)
# create a cursor
c = conn.cursor()
print()
print("Creating titles table...")
c.execute(
"""CREATE TABLE IF NOT EXISTS titles(
title_id TEXT PRIMARY KEY,
title_type TEXT,
primary_title TEXT,
original_title TEXT,
is_adult INT,
start_year REAL,
end_year REAL,
runtime_minutes REAL
)
"""
)
# commit changes
conn.commit()
# read the title data
df = load_data("title.basics.tsv")
# replace N with nan
df.replace("\N", np.nan, inplace=True)
# rename columns
df.rename(
columns={
"tconst": "title_id",
"titleType": "title_type",
"primaryTitle": "primary_title",
"originalTitle": "original_title",
"isAdult": "is_adult",
"startYear": "start_year",
"endYear": "end_year",
"runtimeMinutes": "runtime_minutes",
},
inplace=True,
)
# drop the genres column
title_df = df.drop("genres", axis=1)
# convert the data types from str to numeric
title_df["start_year"] = pd.to_numeric(title_df["start_year"], errors="coerce")
title_df["end_year"] = pd.to_numeric(title_df["end_year"], errors="coerce")
title_df["runtime_minutes"] = pd.to_numeric(
title_df["runtime_minutes"], errors="coerce"
)
# create SQLAlchemy engine
engine = create_engine(
"postgresql://postgres:" + os.environ["DB_PASSWORD"] + "@localhost:5432/imdb"
)
# insert the data into titles table
title_df.to_sql(
"titles", engine, if_exists="replace", index=False, method=psql_insert_copy
)
# commit changes
conn.commit()
# close cursor
c.close()
# close the connection
conn.close()
print("Completed!")
print()
def create_genres_table():
# connect to the database
conn = psycopg2.connect(
dbname="imdb",
user="postgres",
password=os.environ.get("DB_PASSWORD"),
host="localhost",
)
# create a cursor
c = conn.cursor()
print()
print("Creating genres table...")
c.execute(
"""CREATE TABLE IF NOT EXISTS genres(
title_id TEXT NOT NULL,
genre TEXT,
FOREIGN KEY (title_id) REFERENCES titles(title_id)
)
"""
)
# commit changes
conn.commit()
# read the data
df = load_data("title.basics.tsv")
# replace N with nan
df.replace("\N", np.nan, inplace=True)
# rename columns
df.rename(columns={"tconst": "title_id", "genres": "genre"}, inplace=True)
# select only relevant columns
genres_df = df[["title_id", "genre"]].copy()
genres_df = genres_df.assign(genre=genres_df["genre"].str.split(",")).explode(
"genre"
)
# create engine
engine = create_engine(
"postgresql://postgres:" + os.environ["DB_PASSWORD"] + "@localhost:5432/imdb"
)
# insert the data into genres table
genres_df.to_sql(
"genres", engine, if_exists="replace", index=False, method=psql_insert_copy
)
# commit changes
conn.commit()
# close cursor
c.close()
# close the connection
conn.close()
print("Completed!")
print()
if __name__ == "__main__":
print()
print("Creating IMDB Database...")
# connect to the database
conn = psycopg2.connect(
dbname="imdb",
user="postgres",
password=os.environ.get("DB_PASSWORD"),
host="localhost",
)
# create the titles table
create_titles_table()
# create genres table
create_genres_table()
# close the connection
conn.close()
print("Done with Everything!")
print()
Advertisement
Answer
I think the problem is to_sql(if_exists="replace")
. Try using to_sql(if_exists="append")
– my understanding is that “replace” drops the whole table and creates a new one with no constraints.