Skip to content
Advertisement

Primary key constraint gets removed when creating postgres table from pandas dataframe

I am trying to create few tables in Postgres from pandas dataframe but I am kept getting this error.

 psycopg2.errors.InvalidForeignKey: there is no unique constraint matching given keys for referenced table "titles"

After looking into this problem for hours, i finally found that when I am inserting the data into parent table from pandas dataframe, the primary key constraint gets removed for some reasons and due to that I am getting this error when trying to refernece it from another table.

But I am not having this problem when I am using pgAdmin4 to create the table and inserting few rows of data manually.

enter image description here enter image description here

you can see when I created the tables using pgAdmin, the primary key and foreign keys are getting created as expected and I have no problem with it.

But when I try to insert the data from pandas dataframe using psycopg2 library, the primary key is not getting created.

I Can’t able to understand why is this happening.

enter image description here

The code I am using to create the tables –

# function for faster data insertion
def psql_insert_copy(table, conn, keys, data_iter):
    """
    Execute SQL statement inserting data

    Parameters
    ----------
    table : pandas.io.sql.SQLTable
    conn : sqlalchemy.engine.Engine or sqlalchemy.engine.Connection
    keys : list of str
        Column names
    data_iter : Iterable that iterates the values to be inserted
    """
    # gets a DBAPI connection that can provide a cursor
    dbapi_conn = conn.connection
    with dbapi_conn.cursor() as cur:
        s_buf = StringIO()
        writer = csv.writer(s_buf)
        writer.writerows(data_iter)
        s_buf.seek(0)

        columns = ", ".join('"{}"'.format(k) for k in keys)
        if table.schema:
            table_name = "{}.{}".format(table.schema, table.name)
        else:
            table_name = table.name

        sql = "COPY {} ({}) FROM STDIN WITH CSV".format(table_name, columns)
        cur.copy_expert(sql=sql, file=s_buf)


def create_titles_table():
    # connect to the database
    conn = psycopg2.connect(
        dbname="imdb",
        user="postgres",
        password=os.environ.get("DB_PASSWORD"),
        host="localhost",
    )
    # create a cursor
    c = conn.cursor()

    print()
    print("Creating titles table...")
    c.execute(
        """CREATE TABLE IF NOT EXISTS titles(
                title_id  TEXT PRIMARY KEY,
                title_type  TEXT, 
                primary_title  TEXT, 
                original_title  TEXT,
                is_adult  INT, 
                start_year  REAL, 
                end_year  REAL, 
                runtime_minutes REAL
                )
    """
    )
    # commit changes
    conn.commit()

    # read the title data
    df = load_data("title.basics.tsv")
    # replace N with nan
    df.replace("\N", np.nan, inplace=True)
    # rename columns
    df.rename(
        columns={
            "tconst": "title_id",
            "titleType": "title_type",
            "primaryTitle": "primary_title",
            "originalTitle": "original_title",
            "isAdult": "is_adult",
            "startYear": "start_year",
            "endYear": "end_year",
            "runtimeMinutes": "runtime_minutes",
        },
        inplace=True,
    )
    # drop the genres column
    title_df = df.drop("genres", axis=1)
    # convert the data types from str to numeric
    title_df["start_year"] = pd.to_numeric(title_df["start_year"], errors="coerce")
    title_df["end_year"] = pd.to_numeric(title_df["end_year"], errors="coerce")
    title_df["runtime_minutes"] = pd.to_numeric(
        title_df["runtime_minutes"], errors="coerce"
    )

    # create SQLAlchemy engine
    engine = create_engine(
        "postgresql://postgres:" + os.environ["DB_PASSWORD"] + "@localhost:5432/imdb"
    )
    # insert the data into titles table
    title_df.to_sql(
        "titles", engine, if_exists="replace", index=False, method=psql_insert_copy
    )

    # commit changes
    conn.commit()
    # close cursor
    c.close()
    # close the connection
    conn.close()
    print("Completed!")
    print()


def create_genres_table():

    # connect to the database
    conn = psycopg2.connect(
        dbname="imdb",
        user="postgres",
        password=os.environ.get("DB_PASSWORD"),
        host="localhost",
    )

    # create a cursor
    c = conn.cursor()

    print()
    print("Creating genres table...")
    c.execute(
        """CREATE TABLE IF NOT EXISTS genres(
            title_id  TEXT NOT NULL, 
            genre  TEXT,
            FOREIGN KEY (title_id) REFERENCES titles(title_id)
            )
    """
    )
    # commit changes
    conn.commit()

    # read the data
    df = load_data("title.basics.tsv")
    # replace N with nan
    df.replace("\N", np.nan, inplace=True)
    # rename columns
    df.rename(columns={"tconst": "title_id", "genres": "genre"}, inplace=True)
    # select only relevant columns
    genres_df = df[["title_id", "genre"]].copy()
    genres_df = genres_df.assign(genre=genres_df["genre"].str.split(",")).explode(
        "genre"
    )

    # create engine
    engine = create_engine(
        "postgresql://postgres:" + os.environ["DB_PASSWORD"] + "@localhost:5432/imdb"
    )

    # insert the data into genres table
    genres_df.to_sql(
        "genres", engine, if_exists="replace", index=False, method=psql_insert_copy
    )

    # commit changes
    conn.commit()
    # close cursor
    c.close()
    # close the connection
    conn.close()
    print("Completed!")
    print()

if __name__ == "__main__":

    print()
    print("Creating IMDB Database...")
    # connect to the database
    conn = psycopg2.connect(
        dbname="imdb",
        user="postgres",
        password=os.environ.get("DB_PASSWORD"),
        host="localhost",
    )

    # create the titles table
    create_titles_table()

    # create genres table
    create_genres_table()
    # close the connection
    conn.close()
    print("Done with Everything!")
    print()

Advertisement

Answer

I think the problem is to_sql(if_exists="replace"). Try using to_sql(if_exists="append") – my understanding is that “replace” drops the whole table and creates a new one with no constraints.

User contributions licensed under: CC BY-SA
2 People found this is helpful
Advertisement