Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
IMAGE_API_SLASH_SUB = '-~' # Must always correspond to the Cantaloupe setting CANTALOUPE_SLASH_SUBSTITUTE
IMAGE_PDF_MEMORY_LIMIT = 100*1024*1024 # Limit on memory used to create the pdf, in bytes
IMAGE_PDF_PAGE_LIMIT = 100 # Limit on the number of pages which can be downloaded as pdf
IMAGE_PROXY_RETRIES = 1 # Number of retries for failed upstream image requests (Cantaloupe cold-cache)
IMAGE_PROXY_RETRY_DELAY = 2 # Seconds to wait between retries

SQLALCHEMY_DATABASE_URI = 'postgres://scan_explorer:scan_explorer@postgres_service/scan_explorer_service'
SQLALCHEMY_TRACK_MODIFICATIONS = False
Expand Down
28 changes: 24 additions & 4 deletions scan_explorer_service/manifest_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,12 @@
class ManifestFactoryExtended(ManifestFactory):
""" Extended manifest factory.

Extension of the iiif_prezi manifest factory with helper
Extension of the iiif_prezi manifest factory with helper
functions used to create manifest objects from model.
"""

def create_manifest(self, item: Union[Article, Collection]):
self.canvas_dict = {}
manifest = self.manifest(
ident=f'{item.id}/manifest.json', label=item.id)
manifest.description = item.id
Expand All @@ -20,10 +21,29 @@ def create_manifest(self, item: Union[Article, Collection]):
manifest.add_range(range)
return manifest

def create_collection_manifest(self, collection, pages, articles, article_pages):
    """Build a manifest for a collection from already-loaded pages and articles.

    One sequence receives a canvas for every entry of ``pages``, in the order
    given. Each entry of ``articles`` then contributes one range, identified
    and labelled by its bibcode, filled with the canvases of the pages found
    under ``article.id`` in ``article_pages``. An article without an entry
    there yields a range with no canvases.
    """
    # Start from an empty canvas_dict so nothing left over from an earlier
    # call is reused (presumably the cache behind get_or_create_canvas,
    # which is defined outside this view).
    self.canvas_dict = {}

    collection_manifest = self.manifest(
        ident=f'{collection.id}/manifest.json', label=collection.id)
    collection_manifest.description = collection.id

    page_sequence = self.sequence()
    for collection_page in pages:
        canvas = self.get_or_create_canvas(collection_page)
        page_sequence.add_canvas(canvas)
    collection_manifest.add_sequence(page_sequence)

    for article in articles:
        article_range = self.range(ident=article.bibcode, label=article.bibcode)
        pages_of_article = article_pages.get(article.id, [])
        for article_page in pages_of_article:
            article_range.add_canvas(self.get_or_create_canvas(article_page))
        collection_manifest.add_range(article_range)

    return collection_manifest

def create_sequence(self, item: Union[Article, Collection]):
    """Return a sequence holding one canvas per page of ``item``.

    Pages are visited in the order ``item.pages`` yields them.
    """
    sequence: Sequence = self.sequence()
    for page in item.pages:
        # Exactly one canvas is added per page.
        sequence.add_canvas(self.get_or_create_canvas(page))
    return sequence

def create_range(self, item: Union[Article, Collection]):
Expand Down Expand Up @@ -67,7 +87,7 @@ def create_image_annotation(self, page: Page):

# Override default image quality and format set by prezi
image.id = image.id.replace(f'/default.jpg', f'/{page.image_color_quality}.tif')

image.format = page.format
image.height = page.height
image.width = page.width
Expand All @@ -76,5 +96,5 @@ def create_image_annotation(self, page: Page):
def add_search_service(self, manifest: Manifest, search_url: str):
    """Register a search service on ``manifest`` that points at ``search_url``.

    The service is declared with the IIIF search/1 context and profile URIs.
    """
    manifest.add_service(
        ident=search_url,
        context='http://iiif.io/api/search/1/context.json',
        profile='http://iiif.io/api/search/1/search',
    )
12 changes: 7 additions & 5 deletions scan_explorer_service/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,14 @@ def __init__(self, **kwargs):
@property
def serialized(self):
    """Return object data in serializable format.

    The ``thumbnail`` entry is ``None`` when the collection has no pages.
    """
    # Fetch the first page once so a missing page can be detected before
    # its thumbnail URL is read.
    first_page = self.pages.first()
    return {
        'id': self.id,
        'type': 'collection',
        'journal': self.journal,
        'volume': self.volume,
        'pages': self.pages.count(),
        'thumbnail': first_page.thumbnail_url if first_page else None
    }


Expand Down Expand Up @@ -103,12 +104,13 @@ def __init__(self, bibcode, collection_id):
@property
def serialized(self):
    """Return object data in serializable format.

    The ``thumbnail`` entry is ``None`` when the article has no pages.
    """
    # Fetch the first page once so a missing page can be detected before
    # its thumbnail URL is read.
    first_page = self.pages.first()
    return {
        'id': self.id,
        'type': 'article',
        'bibcode': self.bibcode,
        'pages': self.pages.count(),
        'thumbnail': first_page.thumbnail_url if first_page else None,
        'collection_id': self.collection_id
    }

Expand Down Expand Up @@ -156,11 +158,11 @@ def image_url(self):
@property
def image_path(self):
    """Return the image path with its segments joined by the configured separator.

    The segments are the first element of ``image_path_basic``. They are
    joined with the ``IMAGE_API_SLASH_SUB`` application setting (``'%2F'``
    when the setting is absent). ``'.tif'`` is appended unless the page's
    colour type is ``PageColor.BW``.
    """
    separator = current_app.config.get('IMAGE_API_SLASH_SUB', '%2F')
    image_path = separator.join(self.image_path_basic[0])
    if self.color_type != PageColor.BW:
        image_path += '.tif'
    return image_path

@property
def image_path_basic(self):
image_format = ''
Expand Down
172 changes: 172 additions & 0 deletions scan_explorer_service/tests/test_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,5 +79,177 @@ def test_search_article_with_highlight(self, OpenSearch):
self.assertEqual(expected_query, call_kwargs.get('body'))


class TestCollectionManifest(TestCaseDatabase):
    """Manifest endpoint tests for one collection holding two articles.

    Three pages are stored; the middle page is attached to both articles.
    """

    def create_app(self):
        """Build the application against the test database."""
        from scan_explorer_service.app import create_app
        settings = {
            'SQLALCHEMY_DATABASE_URI': self.postgresql_url,
            'SQLALCHEMY_ECHO': False,
            'TESTING': True,
            'PROPAGATE_EXCEPTIONS': True,
            'TRAP_BAD_REQUEST_ERRORS': True,
            'PRESERVE_CONTEXT_ON_EXCEPTION': False,
        }
        return create_app(**settings)

    def setUp(self):
        """Recreate the schema and store the collection, articles and pages."""
        db = self.app.db
        Base.metadata.drop_all(bind=db.engine)
        Base.metadata.create_all(bind=db.engine)

        self.collection = Collection(type='type', journal='journal', volume='volume')
        db.session.add(self.collection)
        db.session.commit()
        db.session.refresh(self.collection)
        self.collection_id = self.collection.id

        self.article1 = Article(bibcode='1988ApJ...333..341R',
                                collection_id=self.collection_id)
        self.article2 = Article(bibcode='1988ApJ...333..352Z',
                                collection_id=self.collection_id)
        for article in (self.article1, self.article2):
            db.session.add(article)
        db.session.commit()
        self.article1_bibcode = self.article1.bibcode
        self.article2_bibcode = self.article2.bibcode

        # page1..page3 share the same dimensions and differ only in name,
        # running page number and label.
        for number in (1, 2, 3):
            page = Page(name=f'page{number}', collection_id=self.collection_id,
                        volume_running_page_num=number)
            page.width = 1000
            page.height = 1000
            page.label = str(number)
            setattr(self, f'page{number}', page)
        db.session.add_all([self.page1, self.page2, self.page3])
        db.session.commit()

        # page2 is linked to both articles.
        links = (
            (self.article1, self.page1),
            (self.article1, self.page2),
            (self.article2, self.page2),
            (self.article2, self.page3),
        )
        for article, page in links:
            article.pages.append(page)
        db.session.commit()

    def _load_manifest(self):
        """Request the collection manifest; return the response and its decoded JSON body."""
        manifest_url = url_for("manifest.get_manifest", id=self.collection_id)
        response = self.client.get(manifest_url)
        return response, json.loads(response.data)

    def test_get_collection_manifest(self):
        """The manifest lists three canvases and one range per article."""
        response, manifest = self._load_manifest()

        self.assertStatus(response, 200)
        self.assertEqual(manifest['@type'], 'sc:Manifest')
        self.assertEqual(manifest['label'], self.collection_id)

        self.assertEqual(len(manifest['sequences'][0]['canvases']), 3)

        structures = manifest['structures']
        self.assertEqual(len(structures), 2)
        labels = [entry['label'] for entry in structures]
        for bibcode in (self.article1_bibcode, self.article2_bibcode):
            self.assertIn(bibcode, labels)

    def test_collection_manifest_page_order(self):
        """Canvases follow the running page order of the volume."""
        _, manifest = self._load_manifest()

        canvas_labels = [canvas['label'] for canvas in manifest['sequences'][0]['canvases']]
        self.assertEqual(canvas_labels, ['p. 1', 'p. 2', 'p. 3'])

    def test_collection_manifest_range_canvases(self):
        """Each article range holds the two canvases of its pages."""
        _, manifest = self._load_manifest()

        by_label = {entry['label']: entry for entry in manifest['structures']}

        first_range = by_label[self.article1_bibcode]
        self.assertEqual(len(first_range['canvases']), 2)

        second_range = by_label[self.article2_bibcode]
        self.assertEqual(len(second_range['canvases']), 2)

    def test_collection_manifest_canvas_has_article_metadata(self):
        """The canvas of the shared page names both articles in its metadata."""
        _, manifest = self._load_manifest()

        shared_canvas = manifest['sequences'][0]['canvases'][1]
        metadata = {entry['label']: entry['value'] for entry in shared_canvas['metadata']}
        self.assertIn(self.article1_bibcode, metadata['Abstract'])
        self.assertIn(self.article2_bibcode, metadata['Abstract'])

    def test_collection_manifest_not_found(self):
        """An unknown id answers with 404."""
        missing_url = url_for("manifest.get_manifest", id='nonexistent')
        self.assertStatus(self.client.get(missing_url), 404)


class TestCanvasDictIsolation(TestCaseDatabase):
    """S1: Verify canvas_dict is reset between manifest calls on the singleton."""

    def create_app(self):
        """Build the application against the test database."""
        from scan_explorer_service.app import create_app
        settings = {
            'SQLALCHEMY_DATABASE_URI': self.postgresql_url,
            'SQLALCHEMY_ECHO': False,
            'TESTING': True,
            'PROPAGATE_EXCEPTIONS': True,
            'TRAP_BAD_REQUEST_ERRORS': True,
            'PRESERVE_CONTEXT_ON_EXCEPTION': False,
        }
        return create_app(**settings)

    def setUp(self):
        """Store two collections: the first with two pages, the second with one."""
        db = self.app.db
        Base.metadata.drop_all(bind=db.engine)
        Base.metadata.create_all(bind=db.engine)

        self.col1 = Collection(type='type', journal='jrnlA', volume='0001')
        self.col2 = Collection(type='type', journal='jrnlB', volume='0002')
        db.session.add_all([self.col1, self.col2])
        db.session.commit()

        self.art1 = Article(bibcode='2000jrnlA...1..001A', collection_id=self.col1.id)
        self.art2 = Article(bibcode='2000jrnlB...2..001B', collection_id=self.col2.id)
        db.session.add_all([self.art1, self.art2])
        db.session.commit()

        first_page = Page(name='p1', collection_id=self.col1.id, volume_running_page_num=1)
        first_page.width = 100
        first_page.height = 100
        first_page.label = '1'
        second_page = Page(name='p2', collection_id=self.col1.id, volume_running_page_num=2)
        second_page.width = 100
        second_page.height = 100
        second_page.label = '2'
        other_page = Page(name='p3', collection_id=self.col2.id, volume_running_page_num=1)
        other_page.width = 100
        other_page.height = 100
        other_page.label = '1'
        db.session.add_all([first_page, second_page, other_page])
        db.session.commit()

        for article, page in ((self.art1, first_page),
                              (self.art1, second_page),
                              (self.art2, other_page)):
            article.pages.append(page)
        db.session.commit()

        self.col1_id = self.col1.id
        self.col2_id = self.col2.id

    def _canvases_of(self, collection_id):
        """Request a collection manifest and return the canvases of its first sequence."""
        response = self.client.get(url_for("manifest.get_manifest", id=collection_id))
        manifest = json.loads(response.data)
        return manifest['sequences'][0]['canvases']

    def test_sequential_manifests_have_isolated_canvases(self):
        """Two manifests requested back to back share no canvas ids."""
        first_canvases = self._canvases_of(self.col1_id)
        self.assertEqual(len(first_canvases), 2)

        second_canvases = self._canvases_of(self.col2_id)
        self.assertEqual(len(second_canvases), 1)

        first_ids = {canvas['@id'] for canvas in first_canvases}
        second_ids = {canvas['@id'] for canvas in second_canvases}
        self.assertTrue(first_ids.isdisjoint(second_ids))


# Run the tests in this module when the file is executed directly.
if __name__ == '__main__':
    unittest.main()
74 changes: 74 additions & 0 deletions scan_explorer_service/tests/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,5 +282,79 @@ def test_put_collection_deduplicates_articles(self):
self.assertEqual(len(links), 2)


class TestNullPageHandling(TestCaseDatabase):
    """Tests for S2, S3, S5: null dereference guards when articles/collections have no pages."""

    def create_app(self):
        """Build the application against the test database and a placeholder OpenSearch URL."""
        from scan_explorer_service.app import create_app
        settings = {
            'SQLALCHEMY_DATABASE_URI': self.postgresql_url,
            'OPEN_SEARCH_URL': 'http://localhost:1234',
            'OPEN_SEARCH_INDEX': 'test',
            'SQLALCHEMY_ECHO': False,
            'TESTING': True,
            'PROPAGATE_EXCEPTIONS': True,
            'TRAP_BAD_REQUEST_ERRORS': True,
            'PRESERVE_CONTEXT_ON_EXCEPTION': False,
        }
        return create_app(**settings)

    def setUp(self):
        """Store one collection and one article, neither with any page."""
        db = self.app.db
        Base.metadata.drop_all(bind=db.engine)
        Base.metadata.create_all(bind=db.engine)

        self.collection = Collection(type='type', journal='journal', volume='volume')
        db.session.add(self.collection)
        db.session.commit()
        self.collection_id = self.collection.id

        self.article = Article(bibcode='2000ApJ...001..001X',
                               collection_id=self.collection_id)
        db.session.add(self.article)
        db.session.commit()
        self.article_bibcode = self.article.bibcode
        self.article_id = self.article.id

    def _assert_no_pages_404(self, url):
        """GET ``url`` and expect a 404 whose message mentions 'no pages'."""
        response = self.client.get(url)
        self.assertStatus(response, 404)
        body = json.loads(response.data)
        self.assertIn('no pages', body['message'].lower())

    def test_collection_serialized_no_pages(self):
        """S3: Collection.serialized returns thumbnail=None when no pages exist."""
        with self.app.app_context():
            stored = self.app.db.session.query(Collection).get(self.collection_id)
            payload = stored.serialized
            self.assertIsNone(payload['thumbnail'])
            self.assertEqual(payload['pages'], 0)

    def test_article_serialized_no_pages(self):
        """S3: Article.serialized returns thumbnail=None when no pages exist."""
        with self.app.app_context():
            stored = self.app.db.session.query(Article).get(self.article_id)
            payload = stored.serialized
            self.assertIsNone(payload['thumbnail'])
            self.assertEqual(payload['pages'], 0)

    def test_collection_thumbnail_no_pages(self):
        """S2: collection_thumbnail raises when collection has no pages."""
        from scan_explorer_service.utils.db_utils import collection_thumbnail
        with self.app.app_context():
            with self.assertRaises(Exception) as raised:
                collection_thumbnail(self.app.db.session, self.collection_id)
            # Checked after the assertRaises block has captured the exception.
            self.assertIn('No pages found', str(raised.exception))

    def test_article_collection_no_pages(self):
        """S5 (related): article_collection returns 404 when article has no pages."""
        self._assert_no_pages_404(
            url_for("metadata.article_collection", bibcode=self.article_bibcode))

    @patch('opensearchpy.OpenSearch')
    def test_get_page_ocr_article_no_pages(self, OpenSearch):
        """S5: get_page_ocr returns 404 when article has no pages."""
        self._assert_no_pages_404(
            url_for("metadata.get_page_ocr", id=self.article_id))


# Run the tests in this module when the file is executed directly.
if __name__ == '__main__':
    unittest.main()
Loading