User Tools

Site Tools


postgresqlimdb

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Next revision
Previous revision
postgresqlimdb [2024/03/08 10:15] – created z0hpvkpostgresqlimdb [2025/03/08 22:24] (current) – external edit 127.0.0.1
Line 1: Line 1:
 ====== IMDB Database ====== ====== IMDB Database ======
  
-The import of the data was done using the script here: [[https://gist.github.com/IllusiveMilkman/2a7a6614193c74804db7650f6d3c2bd2 | Import IMDB Datasets into Postgres]]+The import of the data was based on the script here: [[https://gist.github.com/IllusiveMilkman/2a7a6614193c74804db7650f6d3c2bd2 | Import IMDB Datasets into Postgres]]\\ 
 +Run the below script as the postgres user to avoid permission issues.\\ 
 +<code BASH> 
 +printf "Removing and create folder \n" 
 +rm -rf imdb-datasets/ 
 +mkdir imdb-datasets/ 
 + 
 +printf "Download datasets from https://datasets.imdbws.com \n" 
 +cd imdb-datasets 
 +curl -O https://datasets.imdbws.com/name.basics.tsv.gz 
 +curl -O https://datasets.imdbws.com/title.akas.tsv.gz 
 +curl -O https://datasets.imdbws.com/title.basics.tsv.gz 
 +curl -O https://datasets.imdbws.com/title.crew.tsv.gz 
 +curl -O https://datasets.imdbws.com/title.episode.tsv.gz 
 +curl -O https://datasets.imdbws.com/title.principals.tsv.gz 
 +curl -O https://datasets.imdbws.com/title.ratings.tsv.gz 
 + 
 +printf "Unzipping datasets... \n" 
 +gzip -dk *.gz 
 +cd .. 
 + 
 +printf "Creating Database \n" 
 +psql -d 'postgres' -c "DROP DATABASE IF EXISTS imdb;" 
 +psql -d 'postgres' -c "CREATE DATABASE imdb;" 
 + 
 +printf "Creating tables in imdb database \n" 
 +psql -d imdb -c "CREATE table title_ratings (tconst VARCHAR(10),average_rating NUMERIC,num_votes integer);" 
 +psql -d imdb -c "CREATE TABLE name_basics (nconst varchar(10), primaryName text, birthYear smallint, deathYear smallint, primaryProfession text, knownForTitles text );" 
 +psql -d imdb -c "CREATE TABLE title_akas (titleId TEXT, ordering INTEGER, title TEXT, region TEXT, language TEXT, types TEXT, attributes TEXT, isOriginalTitle BOOLEAN);" 
 +psql -d imdb -c "CREATE TABLE title_basics (tconst TEXT, titleType TEXT, primaryTitle TEXT, originalTitle TEXT, isAdult BOOLEAN, startYear SMALLINT, endYear SMALLINT, runtimeMinutes INTEGER, genres TEXT);" 
 +psql -d imdb -c "CREATE TABLE title_crew (tconst TEXT, directors TEXT, writers TEXT);" 
 +psql -d imdb -c "CREATE TABLE title_episode (const TEXT, parentTconst TEXT, seasonNumber TEXT, episodeNumber TEXT);" 
 +psql -d imdb -c "CREATE TABLE title_principals (tconst TEXT, ordering INTEGER, nconst TEXT, category TEXT, job TEXT, characters TEXT);" 
 + 
 +printf "Inserting data into tables \n" 
 +psql -d imdb -c "COPY title_ratings FROM '$(pwd)/imdb-datasets/title.ratings.tsv' DELIMITER E'\t' QUOTE E'\b' NULL '\N' CSV HEADER" 
 +psql -d imdb -c "COPY name_basics FROM '$(pwd)/imdb-datasets/name.basics.tsv' DELIMITER E'\t' QUOTE E'\b' NULL '\N' CSV HEADER" 
 +psql -d imdb -c "COPY title_akas FROM '$(pwd)/imdb-datasets/title.akas.tsv' DELIMITER E'\t' QUOTE E'\b' NULL '\N' CSV HEADER" 
 +psql -d imdb -c "COPY title_basics FROM '$(pwd)/imdb-datasets/title.basics.tsv' DELIMITER E'\t' QUOTE E'\b' NULL '\N' CSV HEADER" 
 +psql -d imdb -c "COPY title_crew FROM '$(pwd)/imdb-datasets/title.crew.tsv' DELIMITER E'\t' QUOTE E'\b' NULL '\N' CSV HEADER" 
 +psql -d imdb -c "COPY title_episode FROM '$(pwd)/imdb-datasets/title.episode.tsv' DELIMITER E'\t' QUOTE E'\b' NULL '\N' CSV HEADER" 
 +psql -d imdb -c "COPY title_principals FROM '$(pwd)/imdb-datasets/title.principals.tsv' DELIMITER E'\t' QUOTE E'\b' NULL '\N' CSV HEADER" 
 +</code> 
 + 
 +Next run the below SQL script to 
 +  * Create Primary Keys 
 +  * Fix Data Issues 
 +  * Create Foreign Keys 
 +  * Convert Column Lists to Arrays 
 + 
 +<code SQL> 
 +ALTER TABLE title_ratings ADD CONSTRAINT title_ratings_pk PRIMARY KEY (tconst); 
 +ALTER TABLE title_akas ADD CONSTRAINT title_akas_pk PRIMARY KEY (titleid, ordering); 
 +ALTER TABLE title_basics ADD CONSTRAINT title_basics_pk PRIMARY KEY (tconst); 
 +ALTER TABLE title_crew ADD CONSTRAINT title_crew_pk PRIMARY KEY (tconst); 
 +ALTER TABLE title_episode ADD CONSTRAINT title_episode_pk PRIMARY KEY (const); 
 +ALTER TABLE title_principals ADD CONSTRAINT title_principals_pk PRIMARY KEY (tconst, ordering); 
 +ALTER TABLE name_basics ADD CONSTRAINT name_basics_pk PRIMARY KEY (nconst); 
 + 
 +INSERT INTO name_basics (nconst,primaryname) VALUES ('nm2007716','Unknown'), ('nm10111415','Unknown'), 
 +                                                    ('nm15015978','Unknown'), ('nm13369741','Unknown'); 
 + 
 +ALTER TABLE title_ratings ADD CONSTRAINT title_ratings_fk FOREIGN KEY (tconst) REFERENCES title_basics (tconst); 
 +ALTER TABLE title_episode ADD CONSTRAINT title_episode_fk FOREIGN KEY (parenttconst) REFERENCES title_basics (tconst); 
 +ALTER TABLE title_principals ADD CONSTRAINT title_principals_fk FOREIGN KEY (nconst) REFERENCES name_basics (nconst); 
 + 
 +ALTER TABLE name_basics ALTER primaryprofession TYPE text[] USING string_to_array(primaryprofession, ','); 
 +ALTER TABLE name_basics ALTER knownfortitles TYPE text[] USING string_to_array(knownfortitles, ','); 
 +ALTER TABLE title_basics ALTER genres TYPE text[] USING string_to_array(genres, ','); 
 +ALTER TABLE title_crew ALTER directors TYPE text[] USING string_to_array(directors, ','); 
 +ALTER TABLE title_crew ALTER writers TYPE text[] USING string_to_array(writers, ','); 
 +</code>
postgresqlimdb.1709892920.txt.gz · Last modified: 2025/03/08 22:23 (external edit)