Archive for September, 2024
Golang: Parse IMDb database
by admin on Sep.01, 2024, under News
The website IMDb (Internet Movie Database) provides datasets for non-commercial use. The datasets are refreshed daily and published as gzip-compressed TSV (tab-separated values) files, which can be downloaded from this link.
We’ll now write a simple Go program to parse the data provided by IMDb:
package main
import (
	"compress/gzip"
	"encoding/csv"
	"errors"
	"fmt"
	"io"
	"os"
)
// titleLabels holds the label printed for each column of title.basics.tsv,
// in file order: tconst, titleType, primaryTitle, originalTitle, isAdult,
// startYear, endYear, runtimeMinutes, genres.
var titleLabels = []string{
	"Title ID",
	"Title Type",
	"Primary Title",
	"Original Title",
	"Is Adult",
	"Start Year",
	"End Year",
	"Runtime Minutes",
	"Genres",
}

// main looks up a single title in the IMDb title.basics dataset and prints
// one "label: value" line per column. Any failure, including the title not
// being present, is reported on stderr and ends the program with status 1.
func main() {
	const (
		datasetPath = "title.basics.tsv.gz"
		titleID     = "tt0111161" // Example: "The Shawshank Redemption"
	)

	record, err := findTitle(datasetPath, titleID)
	if err != nil {
		fmt.Fprintln(os.Stderr, "Error:", err)
		os.Exit(1)
	}

	// findTitle guarantees at least len(titleLabels) columns.
	for i, label := range titleLabels {
		fmt.Println(label+":", record[i])
	}
}

// findTitle scans the gzip-compressed TSV file at path and returns the first
// row whose first column equals id. It returns an error if the file cannot be
// opened, decompressed or read, or if no row matches.
func findTitle(path, id string) ([]string, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("opening dataset: %w", err)
	}
	defer file.Close()

	gz, err := gzip.NewReader(file)
	if err != nil {
		return nil, fmt.Errorf("decompressing dataset: %w", err)
	}
	defer gz.Close()

	tsvReader := csv.NewReader(gz)
	tsvReader.Comma = '\t' // TSV files are tab-separated
	// The data is not CSV-quoted, so quotes inside titles must be tolerated.
	// Note that encoding/csv still treats a field that starts with a double
	// quote as a quoted field, so an unbalanced leading quote can swallow the
	// rows that follow it.
	tsvReader.LazyQuotes = true
	// Accept rows of any width so one irregular row does not abort the scan;
	// rows that are too short are skipped below.
	tsvReader.FieldsPerRecord = -1
	// The file has millions of rows; reuse the record slice between reads.
	tsvReader.ReuseRecord = true

	for {
		record, err := tsvReader.Read()
		if errors.Is(err, io.EOF) {
			return nil, fmt.Errorf("title %s not found", id)
		}
		if err != nil {
			return nil, fmt.Errorf("reading record: %w", err)
		}
		if len(record) < len(titleLabels) || record[0] != id {
			continue
		}
		// No further Read calls happen, so the reused slice is safe to return.
		return record, nil
	}
}
package main

import (
	"compress/gzip"
	"encoding/csv"
	"errors"
	"fmt"
	"io"
	"os"
)

// titleLabels holds the label printed for each column of title.basics.tsv,
// in file order: tconst, titleType, primaryTitle, originalTitle, isAdult,
// startYear, endYear, runtimeMinutes, genres.
var titleLabels = []string{
	"Title ID",
	"Title Type",
	"Primary Title",
	"Original Title",
	"Is Adult",
	"Start Year",
	"End Year",
	"Runtime Minutes",
	"Genres",
}

// main looks up a single title in the IMDb title.basics dataset and prints
// one "label: value" line per column. Any failure, including the title not
// being present, is reported on stderr and ends the program with status 1.
func main() {
	const (
		datasetPath = "title.basics.tsv.gz"
		titleID     = "tt0111161" // Example: "The Shawshank Redemption"
	)

	record, err := findTitle(datasetPath, titleID)
	if err != nil {
		fmt.Fprintln(os.Stderr, "Error:", err)
		os.Exit(1)
	}

	// findTitle guarantees at least len(titleLabels) columns.
	for i, label := range titleLabels {
		fmt.Println(label+":", record[i])
	}
}

// findTitle scans the gzip-compressed TSV file at path and returns the first
// row whose first column equals id. It returns an error if the file cannot be
// opened, decompressed or read, or if no row matches.
func findTitle(path, id string) ([]string, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("opening dataset: %w", err)
	}
	defer file.Close()

	gz, err := gzip.NewReader(file)
	if err != nil {
		return nil, fmt.Errorf("decompressing dataset: %w", err)
	}
	defer gz.Close()

	tsvReader := csv.NewReader(gz)
	tsvReader.Comma = '\t' // TSV files are tab-separated
	// The data is not CSV-quoted, so quotes inside titles must be tolerated.
	// Note that encoding/csv still treats a field that starts with a double
	// quote as a quoted field, so an unbalanced leading quote can swallow the
	// rows that follow it.
	tsvReader.LazyQuotes = true
	// Accept rows of any width so one irregular row does not abort the scan;
	// rows that are too short are skipped below.
	tsvReader.FieldsPerRecord = -1
	// The file has millions of rows; reuse the record slice between reads.
	tsvReader.ReuseRecord = true

	for {
		record, err := tsvReader.Read()
		if errors.Is(err, io.EOF) {
			return nil, fmt.Errorf("title %s not found", id)
		}
		if err != nil {
			return nil, fmt.Errorf("reading record: %w", err)
		}
		if len(record) < len(titleLabels) || record[0] != id {
			continue
		}
		// No further Read calls happen, so the reused slice is safe to return.
		return record, nil
	}
}
package main

import (
	"compress/gzip"
	"encoding/csv"
	"errors"
	"fmt"
	"io"
	"os"
)

// titleLabels holds the label printed for each column of title.basics.tsv,
// in file order: tconst, titleType, primaryTitle, originalTitle, isAdult,
// startYear, endYear, runtimeMinutes, genres.
var titleLabels = []string{
	"Title ID",
	"Title Type",
	"Primary Title",
	"Original Title",
	"Is Adult",
	"Start Year",
	"End Year",
	"Runtime Minutes",
	"Genres",
}

// main looks up a single title in the IMDb title.basics dataset and prints
// one "label: value" line per column. Any failure, including the title not
// being present, is reported on stderr and ends the program with status 1.
func main() {
	const (
		datasetPath = "title.basics.tsv.gz"
		titleID     = "tt0111161" // Example: "The Shawshank Redemption"
	)

	record, err := findTitle(datasetPath, titleID)
	if err != nil {
		fmt.Fprintln(os.Stderr, "Error:", err)
		os.Exit(1)
	}

	// findTitle guarantees at least len(titleLabels) columns.
	for i, label := range titleLabels {
		fmt.Println(label+":", record[i])
	}
}

// findTitle scans the gzip-compressed TSV file at path and returns the first
// row whose first column equals id. It returns an error if the file cannot be
// opened, decompressed or read, or if no row matches.
func findTitle(path, id string) ([]string, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("opening dataset: %w", err)
	}
	defer file.Close()

	gz, err := gzip.NewReader(file)
	if err != nil {
		return nil, fmt.Errorf("decompressing dataset: %w", err)
	}
	defer gz.Close()

	tsvReader := csv.NewReader(gz)
	tsvReader.Comma = '\t' // TSV files are tab-separated
	// The data is not CSV-quoted, so quotes inside titles must be tolerated.
	// Note that encoding/csv still treats a field that starts with a double
	// quote as a quoted field, so an unbalanced leading quote can swallow the
	// rows that follow it.
	tsvReader.LazyQuotes = true
	// Accept rows of any width so one irregular row does not abort the scan;
	// rows that are too short are skipped below.
	tsvReader.FieldsPerRecord = -1
	// The file has millions of rows; reuse the record slice between reads.
	tsvReader.ReuseRecord = true

	for {
		record, err := tsvReader.Read()
		if errors.Is(err, io.EOF) {
			return nil, fmt.Errorf("title %s not found", id)
		}
		if err != nil {
			return nil, fmt.Errorf("reading record: %w", err)
		}
		if len(record) < len(titleLabels) || record[0] != id {
			continue
		}
		// No further Read calls happen, so the reused slice is safe to return.
		return record, nil
	}
}
Running this program will output something like:
Title ID: tt0111161
Title Type: movie
Primary Title: The Shawshank Redemption
Original Title: The Shawshank Redemption
Is Adult: 0
Start Year: 1994
End Year: \N
Runtime Minutes: 142
Genres: Drama
Title ID: tt0111161 Title Type: movie Primary Title: The Shawshank Redemption Original Title: The Shawshank Redemption Is Adult: 0 Start Year: 1994 End Year: \N Runtime Minutes: 142 Genres: Drama
Title ID: tt0111161 Title Type: movie Primary Title: The Shawshank Redemption Original Title: The Shawshank Redemption Is Adult: 0 Start Year: 1994 End Year: \N Runtime Minutes: 142 Genres: Drama