Skip to content

Commit 2028ecc

Browse files
authored
Merge pull request #741 from thoth-pub/feature/strict-jats-parsing
Feature/strict jats parsing
2 parents 11c1f25 + 99793d2 commit 2028ecc

9 files changed

Lines changed: 750 additions & 124 deletions

File tree

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
55
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
66

77
## [Unreleased]
8+
### Fixed
9+
- [741](https://github.com/thoth-pub/thoth/pull/741) - Harden JATS rich-text handling by rejecting malformed or nested markup and abstract line breaks on write, and normalise Crossref abstract output to avoid invalid nested `jats:p` and `jats:break` elements
810

911
## [[1.0.2]](https://github.com/thoth-pub/thoth/releases/tag/v1.0.2) - 2026-04-03
1012
### Security

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

thoth-api/Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -46,6 +46,7 @@ jsonwebtoken = { version = "10.3.0", optional = true }
4646
juniper = { version = "0.16.1", features = ["chrono", "schema-language", "uuid"] }
4747
lazy_static = "1.5.0"
4848
pulldown-cmark = "0.13.0"
49+
quick-xml = "0.36"
4950
rand = { version = "0.9.0", optional = true }
5051
regex = "1.11.1"
5152
scraper = "0.20.0"
@@ -64,4 +65,4 @@ log = "0.4.26"
6465

6566
[dev-dependencies]
6667
fs2 = "0.4.3"
67-
tokio = { version = "1.44", features = ["macros", "rt"] }
68+
tokio = { version = "1.44", features = ["macros", "rt", "rt-multi-thread"] }

thoth-api/src/graphql/tests.rs

Lines changed: 49 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -1195,11 +1195,19 @@ fn patch_title(title: &Title) -> PatchTitle {
11951195
}
11961196
}
11971197

1198+
fn append_to_jats_paragraph_content(content: &str, suffix: &str) -> String {
1199+
if let Some((head, tail)) = content.rsplit_once("</p>") {
1200+
format!("{head}{suffix}</p>{tail}")
1201+
} else {
1202+
format!("{content}{suffix}")
1203+
}
1204+
}
1205+
11981206
fn patch_abstract(abstract_item: &Abstract) -> PatchAbstract {
11991207
PatchAbstract {
12001208
abstract_id: abstract_item.abstract_id,
12011209
work_id: abstract_item.work_id,
1202-
content: format!("{} Updated", abstract_item.content),
1210+
content: append_to_jats_paragraph_content(&abstract_item.content, " Updated"),
12031211
locale_code: abstract_item.locale_code,
12041212
abstract_type: abstract_item.abstract_type,
12051213
canonical: abstract_item.canonical,
@@ -1210,7 +1218,7 @@ fn patch_biography(biography: &Biography) -> PatchBiography {
12101218
PatchBiography {
12111219
biography_id: biography.biography_id,
12121220
contribution_id: biography.contribution_id,
1213-
content: format!("{} Updated", biography.content),
1221+
content: append_to_jats_paragraph_content(&biography.content, " Updated"),
12141222
canonical: biography.canonical,
12151223
locale_code: biography.locale_code,
12161224
}
@@ -2551,7 +2559,7 @@ query LinkedRelations($reviewId: Uuid!, $endorsementId: Uuid!) {
25512559
}
25522560

25532561
#[test]
2554-
fn graphql_markup_mutations_accept_plain_text_when_markup_is_jats_xml() {
2562+
fn graphql_markup_mutations_accept_valid_jatsxml_but_reject_breaks_and_markup_like_plain_text() {
25552563
let (_guard, pool) = test_db::setup_test_db();
25562564
let schema = create_schema();
25572565
let superuser = test_db::test_superuser("user-jats-xml-mutations");
@@ -2597,12 +2605,17 @@ fn graphql_markup_mutations_accept_plain_text_when_markup_is_jats_xml() {
25972605
);
25982606

25992607
let abstract_item = Abstract::from_id(pool.as_ref(), &seed.abstract_short_id).unwrap();
2600-
update_with_data_and_markup(
2601-
&schema,
2602-
&context,
2603-
"updateAbstract",
2604-
"PatchAbstract",
2605-
"abstractId",
2608+
let abstract_query = r#"
2609+
mutation UpdateAbstract($data: PatchAbstract!, $markup: MarkupFormat!) {
2610+
updateAbstract(data: $data, markupFormat: $markup) {
2611+
abstractId
2612+
}
2613+
}
2614+
"#;
2615+
let mut abstract_vars = Variables::new();
2616+
insert_var(
2617+
&mut abstract_vars,
2618+
"data",
26062619
PatchAbstract {
26072620
abstract_id: abstract_item.abstract_id,
26082621
work_id: abstract_item.work_id,
@@ -2613,36 +2626,43 @@ fn graphql_markup_mutations_accept_plain_text_when_markup_is_jats_xml() {
26132626
abstract_type: abstract_item.abstract_type,
26142627
canonical: abstract_item.canonical,
26152628
},
2616-
MarkupFormat::PlainText,
26172629
);
2618-
2619-
let stored_abstract = Abstract::from_id(pool.as_ref(), &seed.abstract_short_id).unwrap();
2620-
assert_eq!(
2621-
stored_abstract.content,
2622-
"<p>First line<break/>Second line with <inline-formula><tex-math>E=mc^2</tex-math></inline-formula> and <email>user@example.org</email> and <uri>https://example.org</uri></p>"
2630+
insert_var(&mut abstract_vars, "markup", MarkupFormat::PlainText);
2631+
let (_, abstract_errors) =
2632+
juniper::execute_sync(abstract_query, None, &schema, &abstract_vars, &context)
2633+
.expect("GraphQL execution should succeed with validation errors");
2634+
assert!(
2635+
!abstract_errors.is_empty(),
2636+
"Expected abstract validation error"
26232637
);
26242638

26252639
let biography = Biography::from_id(pool.as_ref(), &seed.biography_id).unwrap();
2626-
update_with_data_and_markup(
2627-
&schema,
2628-
&context,
2629-
"updateBiography",
2630-
"PatchBiography",
2631-
"biographyId",
2640+
let biography_query = r#"
2641+
mutation UpdateBiography($data: PatchBiography!, $markup: MarkupFormat!) {
2642+
updateBiography(data: $data, markupFormat: $markup) {
2643+
biographyId
2644+
}
2645+
}
2646+
"#;
2647+
let mut biography_vars = Variables::new();
2648+
insert_var(
2649+
&mut biography_vars,
2650+
"data",
26322651
PatchBiography {
26332652
biography_id: biography.biography_id,
26342653
contribution_id: biography.contribution_id,
26352654
content: "<p>Bio line<break/><inline-formula><tex-math>x^2</tex-math></inline-formula> <email>bio@example.org</email> <uri>https://bio.example.org</uri></p>".to_string(),
26362655
canonical: biography.canonical,
26372656
locale_code: biography.locale_code,
26382657
},
2639-
MarkupFormat::JatsXml,
26402658
);
2641-
2642-
let stored_biography = Biography::from_id(pool.as_ref(), &seed.biography_id).unwrap();
2643-
assert_eq!(
2644-
stored_biography.content,
2645-
"<p>Bio line<break/><inline-formula><tex-math>x^2</tex-math></inline-formula> <email>bio@example.org</email> <uri>https://bio.example.org</uri></p>"
2659+
insert_var(&mut biography_vars, "markup", MarkupFormat::JatsXml);
2660+
let (_, biography_errors) =
2661+
juniper::execute_sync(biography_query, None, &schema, &biography_vars, &context)
2662+
.expect("GraphQL execution should succeed with validation errors");
2663+
assert!(
2664+
!biography_errors.is_empty(),
2665+
"Expected biography validation error"
26462666
);
26472667
}
26482668

@@ -3111,7 +3131,7 @@ fn graphql_mutations_cover_all() {
31113131
"PatchAbstract",
31123132
"abstractId",
31133133
patch_abstract(&abstract_item),
3114-
MarkupFormat::PlainText,
3134+
MarkupFormat::JatsXml,
31153135
);
31163136

31173137
let biography = Biography::from_id(pool.as_ref(), &seed.biography_id).unwrap();
@@ -3122,7 +3142,7 @@ fn graphql_mutations_cover_all() {
31223142
"PatchBiography",
31233143
"biographyId",
31243144
patch_biography(&biography),
3125-
MarkupFormat::PlainText,
3145+
MarkupFormat::JatsXml,
31263146
);
31273147

31283148
let work = Work::from_id(pool.as_ref(), &seed.book_work_id).unwrap();

0 commit comments

Comments
 (0)