Skip to content

Commit 5a7fbe4

Browse files
authored
Merge pull request #362 from readium/pdf-no-meta
Add a flag to avoid the extraction of PDF metadata (except the number of pages) if they are considered too bad.
2 parents 6156324 + ed2803a commit 5a7fbe4

File tree

6 files changed

+151
-68
lines changed

6 files changed

+151
-68
lines changed

encrypt/process_encrypt.go

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ type Publication struct {
5050

5151
// ProcessEncryption encrypts a publication
5252
// inputPath must contain a processable file extension.
53-
func ProcessEncryption(contentID, contentKey, inputPath, tempRepo, outputRepo, storageRepo, storageURL, storageFilename string, extractCover bool) (*Publication, error) {
53+
func ProcessEncryption(contentID, contentKey, inputPath, tempRepo, outputRepo, storageRepo, storageURL, storageFilename string, extractCover, pdfNoMeta bool) (*Publication, error) {
5454

5555
var pub Publication
5656
pub.OutputRepo = outputRepo
@@ -141,7 +141,7 @@ func ProcessEncryption(contentID, contentKey, inputPath, tempRepo, outputRepo, s
141141
case ".epub":
142142
err = processEPUB(&pub, encrypter, contentKey)
143143
case ".pdf":
144-
err = processPDF(&pub, encrypter, contentKey)
144+
err = processPDF(&pub, encrypter, contentKey, pdfNoMeta)
145145
case ".lpf":
146146
err = processLPF(&pub, encrypter, contentKey)
147147
case ".audiobook", ".divina", ".webpub", ".rpf":
@@ -415,7 +415,7 @@ func processEPUB(pub *Publication, encrypter crypto.Encrypter, contentKey string
415415
}
416416

417417
// processPDF wraps a PDF file inside a Readium Package and encrypts its resources
418-
func processPDF(pub *Publication, encrypter crypto.Encrypter, contentKey string) error {
418+
func processPDF(pub *Publication, encrypter crypto.Encrypter, contentKey string, pdfNoMeta bool) error {
419419

420420
log.Println("Process as PDF")
421421

@@ -425,7 +425,7 @@ func processPDF(pub *Publication, encrypter crypto.Encrypter, contentKey string)
425425

426426
// generate a temp Readium Package (rwpp) which embeds the PDF file
427427
// the first page of the PDF is extracted as a JPEG cover image
428-
rwpInfo, err := pack.BuildRPFFromPDF(pub.InputPath, tmpPackagePath, coverPath)
428+
rwpInfo, err := pack.BuildRPFFromPDF(pub.InputPath, tmpPackagePath, coverPath, pdfNoMeta)
429429
// will remove the tmp file even if an error is returned
430430
defer os.Remove(tmpPackagePath)
431431
// process error
@@ -521,6 +521,11 @@ func buildEncryptedRPF(pub *Publication, encrypter crypto.Encrypter, contentKey
521521
}
522522
defer reader.Close()
523523

524+
// set the title from the manifest if not already set
525+
if pub.Title == "" {
526+
pub.Title = reader.Title()
527+
}
528+
524529
// set the target content type from the conformance type in the manifest
525530
ext := filepath.Ext(pub.FileName)
526531
switch reader.ConformsTo() {

frontend/webpublication/webpublication.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ func encryptPublication(inputPath string, pub *Publication, pubManager Publicati
108108
// FIXME: work on a direct storage of the output file.
109109
outputRepo := config.Config.FrontendServer.EncryptedRepository
110110
empty := ""
111-
notification, err := encrypt.ProcessEncryption(empty, empty, inputPath, empty, outputRepo, empty, empty, empty, false)
111+
notification, err := encrypt.ProcessEncryption(empty, empty, inputPath, empty, outputRepo, empty, empty, empty, false, false)
112112
if err != nil {
113113
return err
114114
}

lcpencrypt/lcpencrypt.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,14 @@ func showHelpAndExit() {
2121

2222
fmt.Println("lcpencrypt encrypts a publication using the LCP DRM.")
2323
fmt.Println("-input source epub/pdf/lpf/audiobook file locator (file system or http GET)")
24+
fmt.Println("-provider publication provider (URI)")
2425
fmt.Println("-contentid optional, content identifier; if omitted a uuid is generated")
2526
fmt.Println("-storage optional, target location of the encrypted publication, without filename. File system path or s3 bucket")
2627
fmt.Println("-url optional, base url associated with the storage, without filename")
2728
fmt.Println("-filename optional, file name of the encrypted publication; if omitted, contentid is used")
2829
fmt.Println("-temp optional, working folder for temporary files. If not set, the current directory will be used.")
2930
fmt.Println("-cover optional, boolean, indicates that a cover should be generated")
31+
fmt.Println("-pdfnometa optional, boolean, indicates that PDF metadata must not be extracted")
3032
fmt.Println("-contentkey optional, base64 encoded content key; if omitted a random content key is generated")
3133
fmt.Println("-lcpsv optional, URL, host name of the License Server to be notified; syntax http://username:password@example.com")
3234
fmt.Println("-v2 optional, boolean, indicates communication with a License Server v2")
@@ -48,15 +50,16 @@ func exitWithError(context string, err error) {
4850
}
4951

5052
func main() {
51-
providerUri := flag.String("provider", "", "optional, provider URI")
5253
inputPath := flag.String("input", "", "source epub/pdf/lpf file locator (file system or http GET)")
54+
providerUri := flag.String("provider", "", "optional, publication provider (URI)")
5355
contentid := flag.String("contentid", "", "optional, content identifier; if omitted, a uuid is generated")
5456
storageRepo := flag.String("storage", "", "optional, target location of the encrypted publication, without filename. File system path or s3 bucket")
5557
storageURL := flag.String("url", "", "optional, base url associated with the storage, without filename")
5658
storageFilename := flag.String("filename", "", "optional, file name of the encrypted publication; if omitted, contentid is used")
5759
outputRepo := flag.String("output", "", "optional, target folder of encrypted publications")
5860
tempRepo := flag.String("temp", "", "optional, working folder for temporary files")
5961
cover := flag.Bool("cover", false, "optional, boolean, indicates that covers must be generated when possible")
62+
pdfnometa := flag.Bool("pdfnometa", false, "optional, boolean, indicates that PDF metadata must not be extracted")
6063
contentkey := flag.String("contentkey", "", "optional, base64 encoded content key; if omitted a random content key is generated")
6164
lcpsv := flag.String("lcpsv", "", "URL, host name of the License server which is notified; the preferred syntax is http://username:password@example.com")
6265
v2 := flag.Bool("v2", false, "optional, boolean, indicates a v2 License serve")
@@ -101,7 +104,7 @@ func main() {
101104
}
102105

103106
// encrypt the publication
104-
publication, err := encrypt.ProcessEncryption(*contentid, *contentkey, *inputPath, *tempRepo, *outputRepo, *storageRepo, *storageURL, *storageFilename, *cover)
107+
publication, err := encrypt.ProcessEncryption(*contentid, *contentkey, *inputPath, *tempRepo, *outputRepo, *storageRepo, *storageURL, *storageFilename, *cover, *pdfnometa)
105108
if err != nil {
106109
exitWithError("Error processing a publication", err)
107110
}

pack/rwppackage.go

Lines changed: 72 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ import (
1717
"log"
1818
"net/url"
1919
"os"
20-
"text/template"
2120

2221
"github.com/gen2brain/go-fitz"
2322
"github.com/readium/readium-lcp-server/rwpm"
@@ -229,6 +228,11 @@ func (reader *RPFReader) ConformsTo() string {
229228
return reader.manifest.Metadata.ConformsTo
230229
}
231230

231+
// Title returns the "und" entry of the manifest title (empty if there is no such entry)
232+
func (reader *RPFReader) Title() string {
233+
return reader.manifest.Metadata.Title["und"]
234+
}
235+
232236
// Close closes a Readium Package Reader
233237
func (reader *RPFReader) Close() error {
234238
return reader.zipArchive.Close()
@@ -397,7 +401,7 @@ func OpenRPF(name string) (*RPFReader, error) {
397401

398402
// BuildRPFFromPDF builds a Readium Package (rwpp) which embeds a PDF file and a cover
399403
// the cover file extracted from the PDF is not deleted by this function
400-
func BuildRPFFromPDF(inputPath, packagePath, coverPath string) (RWPInfo, error) {
404+
func BuildRPFFromPDF(inputPath, packagePath, coverPath string, pdfNoMeta bool) (RWPInfo, error) {
401405

402406
var rwpInfo RWPInfo
403407

@@ -456,69 +460,71 @@ func BuildRPFFromPDF(inputPath, packagePath, coverPath string) (RWPInfo, error)
456460
return rwpInfo, err
457461
}
458462

459-
// inject a Readium manifest into the zip output
460-
manifest := `
461-
{
462-
"@context": "https://readium.org/webpub-manifest/context.jsonld"
463-
,
464-
"metadata": {
465-
"@type": "http://schema.org/Book",
466-
"conformsTo": "https://readium.org/webpub-manifest/profiles/pdf",
467-
"title": "{{.Title}}",
468-
"author": "{{.Author}}",
469-
"subject": "{{.Subject}}",
470-
"numberOfPages": {{.NumPages}}
471-
},
472-
"readingOrder": [
473-
{
474-
"href": "publication.pdf", "title": "publication", "type": "application/pdf"
475-
}
476-
],
477-
"resources": [
478-
{
479-
"rel": "cover", "href": "cover.jpg", "type": "image/jpeg"
480-
}
481-
]
482-
}
483-
`
484-
485463
manifestWriter, err := zipWriter.Create(ManifestLocation)
486464
if err != nil {
487465
return rwpInfo, err
488466
}
489467

490-
tmpl, err := template.New("manifest").Parse(manifest)
491-
if err != nil {
492-
return rwpInfo, err
468+
// create simple manifest object
469+
var manifest rwpm.Publication
470+
471+
manifest.Context.Add("https://readium.org/webpub-manifest/context.jsonld")
472+
manifest.Metadata.Type = "http://schema.org/Book"
473+
manifest.Metadata.ConformsTo = "https://readium.org/webpub-manifest/profiles/pdf"
474+
475+
// number of pages is needed to display progress in the reader
476+
manifest.Metadata.NumberOfPages = rwpInfo.NumPages
477+
478+
// PDF metadata can be so bad that we may want to ignore them
479+
if pdfNoMeta {
480+
// we still need a title
481+
filename := filepath.Base(inputPath)
482+
rwpInfo.Title = strings.TrimSuffix(filename, filepath.Ext(filename)) // default title
483+
// remove underscores, hyphens, dots which are frequent in PDF file names
484+
rwpInfo.Title = strings.ReplaceAll(rwpInfo.Title, "_", " ")
485+
rwpInfo.Title = strings.ReplaceAll(rwpInfo.Title, "-", " ")
486+
rwpInfo.Title = strings.ReplaceAll(rwpInfo.Title, ".", " ")
487+
rwpInfo.Title = strings.TrimSpace(rwpInfo.Title)
488+
manifest.Metadata.Title.Set("und", rwpInfo.Title)
489+
// add PDF metadata to the manifest
490+
} else {
491+
// remove underscores, hyphens, stars which are frequent in PDF titles
492+
rwpInfo.Title = strings.ReplaceAll(rwpInfo.Title, "_", " ")
493+
rwpInfo.Title = strings.ReplaceAll(rwpInfo.Title, "-", " ")
494+
rwpInfo.Title = strings.ReplaceAll(rwpInfo.Title, "*", " ")
495+
rwpInfo.Title = strings.TrimSpace(rwpInfo.Title)
496+
if rwpInfo.Title == "" {
497+
rwpInfo.Title = "No Title Available" // default title
498+
}
499+
manifest.Metadata.Title.Set("und", rwpInfo.Title)
500+
// there is zero or one author/subject in the PDF metadata
501+
if len(rwpInfo.Author) != 0 {
502+
manifest.Metadata.Author.AddName(rwpInfo.Author[0])
503+
}
504+
if len(rwpInfo.Subject) != 0 {
505+
manifest.Metadata.Subject.Add(rwpm.Subject{Name: rwpInfo.Subject[0]})
506+
}
493507
}
494508

495-
// remove underscores, hyphens, stars which are frequent in PDF titles
496-
rwpInfo.Title = strings.ReplaceAll(rwpInfo.Title, "_", " ")
497-
rwpInfo.Title = strings.ReplaceAll(rwpInfo.Title, "-", " ")
498-
rwpInfo.Title = strings.ReplaceAll(rwpInfo.Title, "*", " ")
499-
rwpInfo.Title = strings.TrimSpace(rwpInfo.Title)
500-
if rwpInfo.Title == "" {
501-
rwpInfo.Title = "No Title Available" // default title
502-
}
503-
// there is zero or one author/subject in the PDF metadata
504-
if len(rwpInfo.Author) == 0 {
505-
rwpInfo.Author = []string{"unknown"}
506-
}
507-
if len(rwpInfo.Subject) == 0 {
508-
rwpInfo.Subject = []string{"unknown"}
509-
}
510-
params := struct {
511-
Title string
512-
Author string
513-
Subject string
514-
NumPages int
515-
}{
516-
Title: rwpInfo.Title,
517-
Author: rwpInfo.Author[0],
518-
Subject: rwpInfo.Subject[0],
519-
NumPages: rwpInfo.NumPages,
520-
}
521-
err = tmpl.Execute(manifestWriter, params)
509+
manifest.ReadingOrder = []rwpm.Link{
510+
{
511+
Href: "publication.pdf",
512+
Title: "publication",
513+
Type: "application/pdf",
514+
},
515+
}
516+
manifest.Resources = []rwpm.Link{
517+
{
518+
Rel: []string{"cover"},
519+
Href: "cover.jpg",
520+
Type: "image/jpeg",
521+
},
522+
}
523+
524+
// marshal and write manifest as JSON
525+
encoder := json.NewEncoder(manifestWriter)
526+
encoder.SetIndent("", " ")
527+
err = encoder.Encode(manifest)
522528
if err != nil {
523529
return rwpInfo, err
524530
}
@@ -550,8 +556,14 @@ func extractRWPInfo(inputPath, coverPath string) (RWPInfo, error) {
550556
// extract PDF metadata and number of pages
551557
metadata := doc.Metadata()
552558
rwpInfo.Title = cleanNulls(metadata["title"])
553-
rwpInfo.Author = []string{cleanNulls(metadata["author"])}
554-
rwpInfo.Subject = []string{cleanNulls(metadata["subject"])}
559+
author := cleanNulls(metadata["author"])
560+
if author != "" {
561+
rwpInfo.Author = []string{author}
562+
}
563+
subject := cleanNulls(metadata["subject"])
564+
if subject != "" {
565+
rwpInfo.Subject = []string{subject}
566+
}
555567
rwpInfo.NumPages = doc.NumPage()
556568

557569
if coverPath == "" {

pack/rwppackage_test.go

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ package pack
66

77
import (
88
"bytes"
9+
"encoding/json"
910
"testing"
1011
"time"
1112

@@ -118,4 +119,66 @@ func TestSetMetadata(t *testing.T) {
118119
manifest.Metadata.ReadingProgression = "ltr"
119120
manifest.Metadata.Subject.Add(rwpm.Subject{Name: "software", Scheme: "iptc", Code: "04003000"})
120121

122+
manifest.ReadingOrder = []rwpm.Link{
123+
{
124+
Href: "chapter1.html",
125+
Type: "application/xhtml+xml",
126+
},
127+
{
128+
Href: "chapter2.html",
129+
Type: "application/xhtml+xml",
130+
},
131+
}
132+
manifest.Resources = []rwpm.Link{
133+
{
134+
Rel: []string{"stylesheet"},
135+
Href: "style.css",
136+
Type: "text/css",
137+
},
138+
}
139+
140+
// check that the manifest can be encoded as JSON (the encoded output is not read back)
141+
var buf bytes.Buffer
142+
encoder := json.NewEncoder(&buf)
143+
err := encoder.Encode(&manifest)
144+
if err != nil {
145+
t.Fatalf("Could not encode manifest to JSON, %s", err)
146+
}
147+
148+
// verify metadata
149+
150+
if manifest.Metadata.Identifier != "id1" {
151+
t.Errorf("Expected identifier to be 'id1', got '%s'", manifest.Metadata.Identifier)
152+
}
153+
if title := manifest.Metadata.Title["fr"]; title != "title" {
154+
t.Errorf("Expected title to be 'title', got '%s'", title)
155+
}
156+
if manifest.Metadata.Description != "description" {
157+
t.Errorf("Expected description to be 'description', got '%s'", manifest.Metadata.Description)
158+
}
159+
if manifest.Metadata.Published.String() != "2020-03-05" {
160+
t.Errorf("Expected published to be '2020-03-05', got '%s'", manifest.Metadata.Published.String())
161+
}
162+
if manifest.Metadata.Duration != 120 {
163+
t.Errorf("Expected duration to be 120, got %f", manifest.Metadata.Duration)
164+
}
165+
if len(manifest.Metadata.Author) != 1 || manifest.Metadata.Author[0].Name["und"] != "Laurent" {
166+
t.Errorf("Expected author to be 'Laurent', got '%v'", manifest.Metadata.Author)
167+
}
168+
if len(manifest.Metadata.Language) != 1 || manifest.Metadata.Language[0] != "fr" {
169+
t.Errorf("Expected language to be 'fr', got '%v'", manifest.Metadata.Language)
170+
}
171+
if manifest.Metadata.ReadingProgression != "ltr" {
172+
t.Errorf("Expected reading progression to be 'ltr', got '%s'", manifest.Metadata.ReadingProgression)
173+
}
174+
if len(manifest.Metadata.Subject) != 1 || manifest.Metadata.Subject[0].Name != "software" {
175+
t.Errorf("Expected subject to be 'software', got '%v'", manifest.Metadata.Subject)
176+
}
177+
if len(manifest.ReadingOrder) != 2 || manifest.ReadingOrder[0].Href != "chapter1.html" {
178+
t.Errorf("Expected first reading order item to be 'chapter1.html', got '%v'", manifest.ReadingOrder)
179+
}
180+
if len(manifest.ReadingOrder) != 2 || manifest.ReadingOrder[1].Href != "chapter2.html" {
181+
t.Errorf("Expected second reading order item to be 'chapter2.html', got '%v'", manifest.ReadingOrder)
182+
}
183+
121184
}

rwpm/metadata.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ func (d Date) MarshalJSON() ([]byte, error) {
111111
func (d Date) String() string {
112112

113113
date := time.Time(d)
114-
return date.Format("\"2006-01-02\"")
114+
return date.Format("2006-01-02")
115115
}
116116

117117
// Meta is a generic structure for other metadata

0 commit comments

Comments
 (0)