@@ -34,7 +34,7 @@ type Engine interface {
3434 List (page int ) SearchResult
3535 String () string
3636 // parseSingleMovie: parses the result of a colly HTMLElement and returns a movie
37- parseSingleMovie (el * colly.HTMLElement ) (Movie , error )
37+ parseSingleMovie (el * colly.HTMLElement , movieIndex int ) (Movie , error )
3838
3939 // getParseAttrs : get the attributes to use to parse a returned soup
4040 // the first return string is the part of the html to be parsed e.g `body`, `main`
@@ -52,14 +52,11 @@ func Scrape(engine Engine) ([]Movie, error) {
5252 // Cache responses to prevent multiple download of pages
5353 // even if the collector is restarted
5454 colly .CacheDir ("./gophie_cache" ),
55- // colly.Async(true),
55+ colly .Async (true ),
5656 // colly.Debugger(&debug.LogDebugger{}),
5757 )
5858 // Another collector for download Links
59- downloadLinkCollector := colly .NewCollector (
60- colly .CacheDir ("./gophie-cache" ),
61- colly .Async (true )
62- )
59+ downloadLinkCollector := c .Clone ()
6360
6461 var movies = make (map [string ]* Movie )
6562
@@ -70,21 +67,22 @@ func Scrape(engine Engine) ([]Movie, error) {
7067 if err != nil {
7168 log .Fatal (err )
7269 }
70+ movieIndex := 0
7371 c .OnHTML (main , func (e * colly.HTMLElement ) {
7472 e .ForEach (article , func (_ int , el * colly.HTMLElement ) {
75- movie , err := engine .parseSingleMovie (el )
73+ movie , err := engine .parseSingleMovie (el , movieIndex )
7674 if err != nil {
7775 log .Errorf ("%v could not be parsed" , movie )
7876 } else {
7977 // Using DownloadLink as key to movie makes it unique
80- movies [movie .DownloadLink .String ()] = & movie
81- // downloadLinkCollector.Visit(movie.DownloadLink.String())
78+ m := strconv .Itoa (movieIndex )
79+ movies [m ] = & movie
80+ ctx := colly .NewContext ()
81+ ctx .Put ("movieIndex" , m )
82+ downloadLinkCollector .Request ("GET" , movie .DownloadLink .String (), nil , ctx , nil )
83+ movieIndex ++
8284 }
8385 })
84-
85- for _ , movie := range movies {
86- downloadLinkCollector .Visit (movie .DownloadLink .String ())
87- }
8886 })
8987
9088 c .OnRequest (func (r * colly.Request ) {
@@ -101,25 +99,25 @@ func Scrape(engine Engine) ([]Movie, error) {
10199 // movie details when we need it
102100 downloadLinkCollector .OnRequest (func (r * colly.Request ) {
103101 r .Headers .Set ("Accept" , "text/html,application/xhtml+xml,application/xml" )
104- if movie , ok := movies [r .URL .String ()]; ok {
105- log .Debugf ("Retrieving Download Link %v\n " , movie .DownloadLink )
106- }
102+ movie := getMovieFromMovies (r , movies )
103+ log .Debugf ("Retrieving Download Link %v\n " , movie .DownloadLink )
107104 })
108105
109106 // If Response Content Type is not Text, Abort the Request to prevent fully downloading the
110107 // body in case of other types like mp4
111108 downloadLinkCollector .OnResponseHeaders (func (r * colly.Response ) {
109+ log .Infof ("%s" , r .Headers )
112110 if ! strings .Contains (r .Headers .Get ("Content-Type" ), "text" ) {
111+ log .Errorf ("Response %s is not text/html. Aborting request" , r .Request .URL )
113112 r .Request .Abort ()
114- log .Debugf ("Response %s is not text/html. Aborting request" , r .Request .URL )
115113 }
116114 })
117115
118116 downloadLinkCollector .OnResponse (func (r * colly.Response ) {
119- log . Debug (r .Request . URL . String )
120- // movie := movies[r.Request.URL.String()]
121- // log.Infof("%s %v %s", r.Request.URL.String(), movie.DownloadLink, movie.Title )
122- // log.Debugf("Retrieved Download Link %v \n", movie.DownloadLink)
117+ movie := getMovieFromMovies (r .Request , movies )
118+ log . Infof ( "Movie on Response %v" , movie )
119+ // prettyPrint([]Movie{* movie} )
120+ // log.Debugf("Retrieved Download Page %s \n", movie.DownloadLink.String() )
123121 })
124122
125123 c .Visit (engine .getParseURL ().String ())
@@ -242,15 +240,11 @@ func getMovieIndexFromCtx(r *colly.Request) int {
242240 return movieIndex
243241}
244242
245- // Get Movie from a URL
246- func getMovieFromMovies (url string , movies map [string ]* Movie ) * Movie {
247- if _ , ok := movies [url ]; ok {
248- return movies [url ]
249- }
250- for _ , movie := range movies {
251- if (* movie ).DownloadLink .String () == url {
252- return movie
253- }
243+ // Get Movie from a Context
244+ func getMovieFromMovies (r * colly.Request , movies map [string ]* Movie ) * Movie {
245+ movieIndex := r .Ctx .Get ("movieIndex" )
246+ if _ , ok := movies [movieIndex ]; ok {
247+ return movies [movieIndex ]
254248 }
255249 return & Movie {}
256250}
0 commit comments