Archive for September, 2024
Golang: Parse IMDb database
by admin on Sep.01, 2024, under News
The website IMDb (Internet Movie Database) provides datasets for non-commercial use. Updated TSV (tab-separated values) files are published daily and can be downloaded from this link.
We’ll now write a simple Go program to parse the data provided by IMDb:
// Command main looks up a single title in a gzip-compressed IMDb
// title.basics TSV file and prints its columns.
package main

import (
	"bufio"
	"bytes"
	"compress/gzip"
	"fmt"
	"io"
	"os"
	"strings"
)

// titleFieldCount is the number of columns main prints. The column order is
// assumed to be: tconst, titleType, primaryTitle, originalTitle, isAdult,
// startYear, endYear, runtimeMinutes, genres (verify against the dataset
// documentation).
const titleFieldCount = 9

// main opens title.basics.tsv.gz in the current directory, searches it for
// one hard-coded title ID and prints each column of the matching row.
// Failures and a missing title are reported on standard output.
func main() {
	const titleID = "tt0111161" // Example: "The Shawshank Redemption"

	file, err := os.Open("title.basics.tsv.gz")
	if err != nil {
		fmt.Println("Error opening file:", err)
		return
	}
	defer file.Close()

	gz, err := gzip.NewReader(file)
	if err != nil {
		fmt.Println("Error decompressing file:", err)
		return
	}
	defer gz.Close()

	record, err := findTitle(gz, titleID)
	if err != nil {
		fmt.Println("Error reading file:", err)
		return
	}
	if record == nil {
		fmt.Println("Title not found:", titleID)
		return
	}

	fmt.Println("Title ID:", record[0])
	fmt.Println("Title Type:", record[1])
	fmt.Println("Primary Title:", record[2])
	fmt.Println("Original Title:", record[3])
	fmt.Println("Is Adult:", record[4])
	fmt.Println("Start Year:", record[5])
	fmt.Println("End Year:", record[6])
	fmt.Println("Runtime Minutes:", record[7])
	fmt.Println("Genres:", record[8])
}

// findTitle scans tab-separated lines from r and returns the fields of the
// first line whose first column equals id. It returns (nil, nil) when the
// input ends without a match, and a non-nil error when reading fails or the
// matching line has fewer than titleFieldCount fields.
//
// Lines are split on tabs directly instead of going through encoding/csv:
// that reader treats a field starting with a double quote as a quoted field
// (even with LazyQuotes), which swallows the following tabs and newlines
// when the data is not actually quoted.
func findTitle(r io.Reader, id string) ([]string, error) {
	scanner := bufio.NewScanner(r)
	// Allow lines up to 1 MiB; the default Scanner limit is 64 KiB.
	scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)

	// Matching on "id<TAB>" avoids allocating a string for every
	// non-matching line.
	prefix := []byte(id + "\t")
	for scanner.Scan() {
		line := scanner.Bytes()
		if !bytes.HasPrefix(line, prefix) {
			continue
		}
		fields := strings.Split(string(line), "\t")
		if len(fields) < titleFieldCount {
			return nil, fmt.Errorf("title %s: got %d fields, want at least %d", id, len(fields), titleFieldCount)
		}
		return fields, nil
	}
	return nil, scanner.Err()
}
Running this program will output something like:
Title ID: tt0111161
Title Type: movie
Primary Title: The Shawshank Redemption
Original Title: The Shawshank Redemption
Is Adult: 0
Start Year: 1994
End Year: \N
Runtime Minutes: 142
Genres: Drama