77 "net/url"
88 "strconv"
99 "strings"
10+ "sync"
1011
1112 "github.com/gocolly/colly/v2"
1213 // "github.com/gocolly/colly/v2/debug"
@@ -43,7 +44,14 @@ type Engine interface {
4344 getParseAttrs () (string , string , error )
4445
4546 // parseSingleMovie: parses the result of a colly HTMLElement and returns a movie
46- updateDownloadProps (downloadCollector * colly.Collector , movies map [string ]* Movie )
47+ updateDownloadProps (downloadCollector * colly.Collector , scrapedMovies * scraped )
48+ }
49+
50+ // All scraped movies are stored here. Since accessed on different goroutine
51+ // Mutex to prevent Data Race
52+ type scraped struct {
53+ movies map [string ]* Movie
54+ sync.Mutex
4755}
4856
4957// Scrape queries the engine's parse URL and returns the movies it scraped
@@ -58,10 +66,10 @@ func Scrape(engine Engine) ([]Movie, error) {
5866 // Another collector for download Links
5967 downloadLinkCollector := c .Clone ()
6068
61- var movies = make (map [string ]* Movie )
69+ scrapedMovies := scraped { movies : make (map [string ]* Movie )}
6270
6371 // Any extra setup the engine needs for the download collector is done in this function
64- engine .updateDownloadProps (downloadLinkCollector , movies )
72+ engine .updateDownloadProps (downloadLinkCollector , & scrapedMovies )
6573
6674 main , article , err := engine .getParseAttrs ()
6775 if err != nil {
@@ -75,8 +83,10 @@ func Scrape(engine Engine) ([]Movie, error) {
7583 log .Errorf ("%v could not be parsed" , movie )
7684 } else {
7785 // Using DownloadLink as key to movie makes it unique
86+ scrapedMovies .Lock ()
87+ defer scrapedMovies .Unlock ()
7888 m := strconv .Itoa (movieIndex )
79- movies [m ] = & movie
89+ scrapedMovies . movies [m ] = & movie
8090 ctx := colly .NewContext ()
8191 ctx .Put ("movieIndex" , m )
8292 downloadLinkCollector .Request ("GET" , movie .DownloadLink .String (), nil , ctx , nil )
@@ -99,35 +109,32 @@ func Scrape(engine Engine) ([]Movie, error) {
99109 // movie details when we need it
100110 downloadLinkCollector .OnRequest (func (r * colly.Request ) {
101111 r .Headers .Set ("Accept" , "text/html,application/xhtml+xml,application/xml" )
102- movie := getMovieFromMovies (r , movies )
103- log .Debugf ("Retrieving Download Link %v \n " , movie .DownloadLink )
112+ movie := getMovieFromMovies (r , & scrapedMovies )
113+ log .Debugf ("Retrieving Download Link %s \n " , movie .Title )
104114 })
105115
106116 // If Response Content Type is not Text, Abort the Request to prevent fully downloading the
107117 // body in case of other types like mp4
108118 downloadLinkCollector .OnResponseHeaders (func (r * colly.Response ) {
109- log .Infof ("%s" , r .Headers )
110119 if ! strings .Contains (r .Headers .Get ("Content-Type" ), "text" ) {
111- log .Errorf ("Response %s is not text/html. Aborting request" , r .Request .URL )
120+ log .Debugf ("Response %s is not text/html. Aborting request" , r .Request .URL )
112121 r .Request .Abort ()
113122 }
114123 })
115124
116125 downloadLinkCollector .OnResponse (func (r * colly.Response ) {
117- movie := getMovieFromMovies (r .Request , movies )
118- log .Infof ("Movie on Response %v" , movie )
119- // prettyPrint([]Movie{*movie})
120- // log.Debugf("Retrieved Download Page %s\n", movie.DownloadLink.String())
126+ movie := getMovieFromMovies (r .Request , & scrapedMovies )
127+ log .Debugf ("Retrieved Download Page %s\n " , movie .Title )
121128 })
122129
123130 c .Visit (engine .getParseURL ().String ())
124131 c .Wait ()
125132 downloadLinkCollector .Wait ()
126133
127134 // Create a List of Movies
128- v := make ([]Movie , 0 , len (movies ))
135+ v := make ([]Movie , 0 , len (scrapedMovies . movies ))
129136
130- for _ , value := range movies {
137+ for _ , value := range scrapedMovies . movies {
131138 v = append (v , * value )
132139 }
133140 prettyPrint (v )
@@ -241,10 +248,12 @@ func getMovieIndexFromCtx(r *colly.Request) int {
241248}
242249
243250// Get Movie from a Context
244- func getMovieFromMovies (r * colly.Request , movies map [ string ] * Movie ) * Movie {
251+ func getMovieFromMovies (r * colly.Request , scrapedMovies * scraped ) * Movie {
245252 movieIndex := r .Ctx .Get ("movieIndex" )
246- if _ , ok := movies [movieIndex ]; ok {
247- return movies [movieIndex ]
253+ scrapedMovies .Lock ()
254+ defer scrapedMovies .Unlock ()
255+ if _ , ok := scrapedMovies .movies [movieIndex ]; ok {
256+ return scrapedMovies .movies [movieIndex ]
248257 }
249258 return & Movie {}
250259}
0 commit comments