Skip to content

Commit 8e1826d

Browse files
committed
Normalized USFM coming out of and going into ParatextProjectDataProvider
1 parent 71f7016 commit 8e1826d

File tree

7 files changed

+130
-80
lines changed

7 files changed

+130
-80
lines changed

c-sharp/Projects/ParatextProjectDataProvider.cs

Lines changed: 93 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1018,33 +1018,74 @@ public void SendFullProjectUpdateEvent()
10181018

10191019
public string GetBookUsfm(VerseRef verseRef)
10201020
{
1021-
return GetFromScrText(
1021+
return GetUsfmFromScrText(
10221022
verseRef,
10231023
(ScrText scrText, VerseRef verseRef) => scrText.GetText(verseRef, false, true)
10241024
);
10251025
}
10261026

10271027
public string GetChapterUsfm(VerseRef verseRef)
10281028
{
1029-
return GetFromScrText(
1029+
return GetUsfmFromScrText(
10301030
verseRef,
10311031
(ScrText scrText, VerseRef verseRef) => scrText.GetText(verseRef, true, true)
10321032
);
10331033
}
10341034

10351035
public string GetVerseUsfm(VerseRef verseRef)
10361036
{
1037-
return GetFromScrText(
1037+
return GetUsfmFromScrText(
10381038
verseRef,
10391039
(ScrText scrText, VerseRef verseRef) =>
10401040
scrText.Parser.GetVerseUsfmText(FindMatchingVerseRefInScrText(verseRef, scrText))
10411041
);
10421042
}
10431043

1044+
/// <summary>
1045+
/// Copied from `ScrText.StandardizeCrLfsIfNecessary`. We need to do this when setting book USFM
1046+
/// because we do not go through `ScrText.PutText`, and we strip out CR from USFM going out of this class.
1047+
///
1048+
/// Some programs (including cc, which is used for mapin/mapout) strip out CRs.
1049+
/// Put them back in if missing: every newline becomes CR/LF, and the text is terminated with CR/LF if it does not already end with one.
1050+
/// </summary>
1051+
private static string StandardizeCrLfsIfNecessary(string text)
1052+
{
1053+
text = text.Replace("\r", "").Replace("\n", "\r\n");
1054+
if (!text.EndsWith("\r\n", StringComparison.Ordinal))
1055+
text = text + "\r\n";
1056+
return text;
1057+
}
1058+
1059+
/// <summary>
1060+
/// Strip out the carriage returns from a string. This should be run on all USFM text going out
1061+
/// from this class.
1062+
///
1063+
/// We use just LF in Platform.Bible Scripture text so UsjReaderWriter can accurately convert
1064+
/// between USFM and USJ positions without knowing the USFM.
1065+
/// </summary>
1066+
/// <param name="text">Text from which to remove every carriage return ("\r") character</param>
1067+
/// <returns>A copy of <paramref name="text"/> with all carriage returns removed; line feeds are left in place</returns>
1068+
private static string RemoveCarriageReturns(string text)
1069+
{
1070+
return text.Replace("\r", "");
1071+
}
1072+
10441073
public bool SetBookUsfm(VerseRef verseRef, string data)
10451074
{
10461075
verseRef.ChapterNum = 0;
10471076
var scrText = LocalParatextProjects.GetParatextProject(ProjectDetails.Metadata.Id);
1077+
1078+
// Make newlines have CRLF because Paratext 9.4 always does this regardless of operating
1079+
// system, and we want to match Paratext 9.4's whitespace.
1080+
// ScrText.PutText runs other private methods to standardize the text before saving to file
1081+
// as well. Maybe sometime we should see if we can get ScrText.PutBook created or something
1082+
// so we don't have to copy stuff here or have inconsistencies.
1083+
data = StandardizeCrLfsIfNecessary(data);
1084+
// Normalize the USFM before saving (note: this is now done twice when called from SetBookUsx,
1085+
// but that normalization is done before making sure everything is CRLF which does affect
1086+
// the normalization code, unfortunately. Could optimize)
1087+
data = UsfmToken.NormalizeUsfm(scrText, verseRef.BookNum, data);
1088+
10481089
RunWithinLock(
10491090
WriteScope.EntireProject(scrText),
10501091
_ =>
@@ -1077,6 +1118,15 @@ public bool SetChapterUsfm(VerseRef verseRef, string data)
10771118
try
10781119
{
10791120
var scrText = LocalParatextProjects.GetParatextProject(ProjectDetails.Metadata.Id);
1121+
1122+
// Make newlines have CRLF because Paratext 9.4 always does this regardless of operating
1123+
// system, and we want to match Paratext 9.4's whitespace.
1124+
data = StandardizeCrLfsIfNecessary(data);
1125+
// Normalize the USFM before saving (note: this is now done twice when called from SetChapterUsx,
1126+
// but that normalization is done before making sure everything is CRLF which does affect
1127+
// the normalization code, unfortunately. Could optimize)
1128+
data = UsfmToken.NormalizeUsfm(scrText, verseRef.BookNum, data);
1129+
10801130
RunWithinLock(
10811131
WriteScope.EntireProject(scrText),
10821132
writeLock =>
@@ -1198,6 +1248,15 @@ public string GetVersePlainText(VerseRef verseRef)
11981248

11991249
#region Private helper methods
12001250

1251+
/// <summary>
1252+
/// Helper function to make it convenient to get text data from ScrText and normalize it so it
1253+
/// is ready to be used.
1254+
/// </summary>
1255+
/// <param name="verseRef">Verse reference at which to get the text data</param>
1256+
/// <param name="getTextFromScrText">Function to get the text from ScrText. If you want to get USFM from the ScrText, use `GetUsfmFromScrText` instead</param>
1257+
/// <returns></returns>
1258+
/// <exception cref="MissingBookException">If the requested book is missing in the ScrText</exception>
1259+
/// <exception cref="InvalidDataException">If the project was not found</exception>
12011260
private string GetFromScrText(
12021261
VerseRef verseRef,
12031262
Func<ScrText, VerseRef, string> getTextFromScrText
@@ -1220,6 +1279,37 @@ Func<ScrText, VerseRef, string> getTextFromScrText
12201279
}
12211280
}
12221281

1282+
/// <summary>
1283+
/// Helper function to make it convenient to get USFM data from ScrText and normalize it so it
1284+
/// is ready to be used.
1285+
/// </summary>
1286+
/// <param name="verseRef">Verse reference at which to get the USFM data</param>
1287+
/// <param name="getTextFromScrText">Function to get the USFM from ScrText. If you want to get other kinds of text from the ScrText like USX, use `GetFromScrText` instead</param>
1288+
/// <returns>The USFM produced by <paramref name="getTextFromScrText"/>, passed through `UsfmToken.NormalizeUsfm` and then stripped of carriage returns</returns>
1289+
/// <exception cref="MissingBookException">If the requested book is missing in the ScrText</exception>
1290+
/// <exception cref="InvalidDataException">If the project was not found</exception>
1291+
private string GetUsfmFromScrText(
1292+
VerseRef verseRef,
1293+
Func<ScrText, VerseRef, string> getTextFromScrText
1294+
)
1295+
{
1296+
return GetFromScrText(
1297+
verseRef,
1298+
(scrText, verseRef) =>
1299+
{
1300+
// Always normalize the USFM and remove CR before giving it out. This way,
1301+
// UsjReaderWriter can accurately convert between USFM and USJ positions without knowing
1302+
// the USFM.
1303+
var usfm = UsfmToken.NormalizeUsfm(
1304+
scrText,
1305+
verseRef.BookNum,
1306+
getTextFromScrText(scrText, verseRef)
1307+
);
1308+
return RemoveCarriageReturns(usfm);
1309+
}
1310+
);
1311+
}
1312+
12231313
/// <summary>
12241314
/// In a given ScrText, find the VerseRef that best matches the provided VerseRef. If no
12251315
/// good match is found, this returns the original VerseRef passed in since ScrText might still

lib/platform-bible-utils/src/scripture/usj-reader-writer-test-data/testUSFM-2SA-1-locations.ts

Lines changed: 24 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -609,31 +609,11 @@ export const testUSFM2SaCh1Locations: LocationUsfmAndUsj[] = [
609609
},
610610
},
611611
},
612-
{
613-
usfmVerseLocation: {
614-
verseRef: { book: '2SA', chapterNum: 1, verseNum: 0 },
615-
offset: 67,
616-
},
617-
usjContent: {
618-
node: {
619-
type: 'chapter',
620-
marker: 'c',
621-
number: '1',
622-
altnumber: '1 ca',
623-
pubnumber: '1 cp',
624-
sid: '2SA 1',
625-
},
626-
documentLocation: {
627-
jsonPath: "$.content[4]['number']",
628-
propertyOffset: 2,
629-
},
630-
},
631-
},
632612
// begin altnumber - first example of an attribute marker
633613
{
634614
usfmVerseLocation: {
635615
verseRef: { book: '2SA', chapterNum: 1, verseNum: 0 },
636-
offset: 68,
616+
offset: 67,
637617
},
638618
usjContent: {
639619
node: {
@@ -653,7 +633,7 @@ export const testUSFM2SaCh1Locations: LocationUsfmAndUsj[] = [
653633
{
654634
usfmVerseLocation: {
655635
verseRef: { book: '2SA', chapterNum: 1, verseNum: 0 },
656-
offset: 69,
636+
offset: 68,
657637
},
658638
usjContent: {
659639
node: {
@@ -674,7 +654,7 @@ export const testUSFM2SaCh1Locations: LocationUsfmAndUsj[] = [
674654
{
675655
usfmVerseLocation: {
676656
verseRef: { book: '2SA', chapterNum: 1, verseNum: 0 },
677-
offset: 70,
657+
offset: 69,
678658
},
679659
usjContent: {
680660
node: {
@@ -695,7 +675,7 @@ export const testUSFM2SaCh1Locations: LocationUsfmAndUsj[] = [
695675
{
696676
usfmVerseLocation: {
697677
verseRef: { book: '2SA', chapterNum: 1, verseNum: 0 },
698-
offset: 71,
678+
offset: 70,
699679
},
700680
usjContent: {
701681
node: {
@@ -716,7 +696,7 @@ export const testUSFM2SaCh1Locations: LocationUsfmAndUsj[] = [
716696
{
717697
usfmVerseLocation: {
718698
verseRef: { book: '2SA', chapterNum: 1, verseNum: 0 },
719-
offset: 72,
699+
offset: 71,
720700
},
721701
usjContent: {
722702
node: {
@@ -736,7 +716,7 @@ export const testUSFM2SaCh1Locations: LocationUsfmAndUsj[] = [
736716
{
737717
usfmVerseLocation: {
738718
verseRef: { book: '2SA', chapterNum: 1, verseNum: 0 },
739-
offset: 73,
719+
offset: 72,
740720
},
741721
usjContent: {
742722
node: {
@@ -756,7 +736,7 @@ export const testUSFM2SaCh1Locations: LocationUsfmAndUsj[] = [
756736
{
757737
usfmVerseLocation: {
758738
verseRef: { book: '2SA', chapterNum: 1, verseNum: 0 },
759-
offset: 74,
739+
offset: 73,
760740
},
761741
usjContent: {
762742
node: {
@@ -776,7 +756,7 @@ export const testUSFM2SaCh1Locations: LocationUsfmAndUsj[] = [
776756
{
777757
usfmVerseLocation: {
778758
verseRef: { book: '2SA', chapterNum: 1, verseNum: 0 },
779-
offset: 75,
759+
offset: 74,
780760
},
781761
usjContent: {
782762
node: {
@@ -796,7 +776,7 @@ export const testUSFM2SaCh1Locations: LocationUsfmAndUsj[] = [
796776
{
797777
usfmVerseLocation: {
798778
verseRef: { book: '2SA', chapterNum: 1, verseNum: 0 },
799-
offset: 76,
779+
offset: 75,
800780
},
801781
usjContent: {
802782
node: {
@@ -817,7 +797,7 @@ export const testUSFM2SaCh1Locations: LocationUsfmAndUsj[] = [
817797
{
818798
usfmVerseLocation: {
819799
verseRef: { book: '2SA', chapterNum: 1, verseNum: 0 },
820-
offset: 77,
800+
offset: 76,
821801
},
822802
usjContent: {
823803
node: {
@@ -838,7 +818,7 @@ export const testUSFM2SaCh1Locations: LocationUsfmAndUsj[] = [
838818
{
839819
usfmVerseLocation: {
840820
verseRef: { book: '2SA', chapterNum: 1, verseNum: 0 },
841-
offset: 78,
821+
offset: 77,
842822
},
843823
usjContent: {
844824
node: {
@@ -859,7 +839,7 @@ export const testUSFM2SaCh1Locations: LocationUsfmAndUsj[] = [
859839
{
860840
usfmVerseLocation: {
861841
verseRef: { book: '2SA', chapterNum: 1, verseNum: 0 },
862-
offset: 79,
842+
offset: 78,
863843
},
864844
usjContent: {
865845
node: {
@@ -880,7 +860,7 @@ export const testUSFM2SaCh1Locations: LocationUsfmAndUsj[] = [
880860
{
881861
usfmVerseLocation: {
882862
verseRef: { book: '2SA', chapterNum: 1, verseNum: 0 },
883-
offset: 80,
863+
offset: 79,
884864
},
885865
usjContent: {
886866
node: {
@@ -902,7 +882,7 @@ export const testUSFM2SaCh1Locations: LocationUsfmAndUsj[] = [
902882
{
903883
usfmVerseLocation: {
904884
verseRef: { book: '2SA', chapterNum: 1, verseNum: 0 },
905-
offset: 81,
885+
offset: 80,
906886
},
907887
usjContent: {
908888
node: {
@@ -922,7 +902,7 @@ export const testUSFM2SaCh1Locations: LocationUsfmAndUsj[] = [
922902
{
923903
usfmVerseLocation: {
924904
verseRef: { book: '2SA', chapterNum: 1, verseNum: 0 },
925-
offset: 82,
905+
offset: 81,
926906
},
927907
usjContent: {
928908
node: {
@@ -943,7 +923,7 @@ export const testUSFM2SaCh1Locations: LocationUsfmAndUsj[] = [
943923
{
944924
usfmVerseLocation: {
945925
verseRef: { book: '2SA', chapterNum: 1, verseNum: 0 },
946-
offset: 83,
926+
offset: 82,
947927
},
948928
usjContent: {
949929
node: {
@@ -964,7 +944,7 @@ export const testUSFM2SaCh1Locations: LocationUsfmAndUsj[] = [
964944
{
965945
usfmVerseLocation: {
966946
verseRef: { book: '2SA', chapterNum: 1, verseNum: 0 },
967-
offset: 84,
947+
offset: 83,
968948
},
969949
usjContent: {
970950
node: {
@@ -985,7 +965,7 @@ export const testUSFM2SaCh1Locations: LocationUsfmAndUsj[] = [
985965
{
986966
usfmVerseLocation: {
987967
verseRef: { book: '2SA', chapterNum: 1, verseNum: 0 },
988-
offset: 85,
968+
offset: 84,
989969
},
990970
usjContent: {
991971
node: {
@@ -1005,7 +985,7 @@ export const testUSFM2SaCh1Locations: LocationUsfmAndUsj[] = [
1005985
{
1006986
usfmVerseLocation: {
1007987
verseRef: { book: '2SA', chapterNum: 1, verseNum: 0 },
1008-
offset: 86,
988+
offset: 85,
1009989
},
1010990
usjContent: {
1011991
node: {
@@ -1025,7 +1005,7 @@ export const testUSFM2SaCh1Locations: LocationUsfmAndUsj[] = [
10251005
{
10261006
usfmVerseLocation: {
10271007
verseRef: { book: '2SA', chapterNum: 1, verseNum: 0 },
1028-
offset: 87,
1008+
offset: 86,
10291009
},
10301010
usjContent: {
10311011
node: {
@@ -1045,7 +1025,7 @@ export const testUSFM2SaCh1Locations: LocationUsfmAndUsj[] = [
10451025
{
10461026
usfmVerseLocation: {
10471027
verseRef: { book: '2SA', chapterNum: 1, verseNum: 0 },
1048-
offset: 88,
1028+
offset: 87,
10491029
},
10501030
usjContent: {
10511031
node: {
@@ -1065,7 +1045,7 @@ export const testUSFM2SaCh1Locations: LocationUsfmAndUsj[] = [
10651045
{
10661046
usfmVerseLocation: {
10671047
verseRef: { book: '2SA', chapterNum: 1, verseNum: 0 },
1068-
offset: 89,
1048+
offset: 88,
10691049
},
10701050
usjContent: {
10711051
node: {
@@ -1086,7 +1066,7 @@ export const testUSFM2SaCh1Locations: LocationUsfmAndUsj[] = [
10861066
{
10871067
usfmVerseLocation: {
10881068
verseRef: { book: '2SA', chapterNum: 1, verseNum: 0 },
1089-
offset: 90,
1069+
offset: 89,
10901070
},
10911071
usjContent: {
10921072
node: {
@@ -1102,7 +1082,7 @@ export const testUSFM2SaCh1Locations: LocationUsfmAndUsj[] = [
11021082
{
11031083
usfmVerseLocation: {
11041084
verseRef: { book: '2SA', chapterNum: 1, verseNum: 0 },
1105-
offset: 196,
1085+
offset: 195,
11061086
},
11071087
usjContent: {
11081088
node: {

lib/platform-bible-utils/src/scripture/usj-reader-writer-test-data/testUSFM-2SA-1.usfm

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,7 @@
22
\toc3 2Sam
33
\toc2 2 Sam
44
\toc1 2 Samuel
5-
\c 1
6-
\ca 1 ca\ca*
5+
\c 1 \ca 1 ca\ca*
76
\cp 1 cp
87
\s1 This chapter and the next two chapters have lots of challenging USFM test markers and such in them.
98
\p

0 commit comments

Comments
 (0)