Skip to content

Commit d7a23c2

Browse files
committed
feat: use stream for downloading and uploading.
1 parent 68672eb commit d7a23c2

2 files changed

Lines changed: 149 additions & 6 deletions

File tree

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
defmodule ExCubicIngestion.Downloader do
  @moduledoc """
  Stream wrapper around HTTPoison.get!(...) that will download at least
  @min_stream_chunk_size of data before sending to stream.

  Modified from source: https://elixirforum.com/t/how-to-stream-file-from-aws-to-client-through-elixir-backend/20693/15?u=bfolkens
  """

  # minimum required for multipart upload to S3
  @min_stream_chunk_size 5 * 1024 * 1024

  @doc """
  Main function of module. Allows for creating a stream from an HTTPoison get!

  The request is made asynchronously (`async: :once`), and each received chunk
  is buffered until at least @min_stream_chunk_size bytes have accumulated,
  satisfying S3's minimum multipart upload part size. Any remainder smaller
  than that is emitted as the final element of the stream.

  Raises if the server responds with a non-2xx status code.
  """
  @spec stream!(String.t(), module()) :: Enumerable.t()
  def stream!(url, lib_httpoison \\ HTTPoison) do
    Stream.resource(
      # start async request with httpoison to initiate stream
      fn ->
        %{
          ref: lib_httpoison.get!(url, %{}, stream_to: self(), async: :once),
          stream_chunk: nil,
          received_chunks_size: 0,
          content_length: 0,
          # set once everything has been emitted, so we never block in
          # receive_response/1 after the request has already ended
          done: false
        }
      end,
      # construct stream
      fn
        # all data has been emitted; nothing more will arrive
        %{done: true} = acc ->
          {:halt, acc}

        acc ->
          case receive_response(acc.ref) do
            # returning the chunk to the stream
            {:ok, {:chunk, response_chunk}} ->
              process_chunk(response_chunk, acc, lib_httpoison)

            # extract content length from header, so we can make a determination if
            # we have received all data
            {:ok, {:headers, headers}} ->
              process_headers(headers, acc, lib_httpoison)

            # for all other messages ignore by not sending anything to the stream
            {:ok, msg} ->
              process_status(msg, acc, lib_httpoison)

            {:error, error} ->
              raise("Error during download: #{inspect(error)}")

            # flush any data still buffered (covers responses without a usable
            # Content-Length header, e.g. chunked transfer encoding), then halt
            :done ->
              flush_remaining(acc)
          end
      end,
      # lastly, close out request (the hackney ref is the async response's id)
      fn acc ->
        :hackney.stop_async(acc.ref.id)
      end
    )
  end

  # Receives the next HTTPoison async message for this request and tags it for
  # the stream loop. NOTE(review): there is no `after` timeout here, so a
  # connection that dies without delivering AsyncEnd will block indefinitely.
  defp receive_response(ref) do
    id = ref.id

    receive do
      %HTTPoison.AsyncStatus{code: code, id: ^id} when code >= 200 and code < 300 ->
        {:ok, {:status_code, code}}

      %HTTPoison.AsyncStatus{code: code, id: ^id} ->
        {:error, {:status_code, code}}

      %HTTPoison.AsyncHeaders{headers: headers, id: ^id} ->
        {:ok, {:headers, headers}}

      %HTTPoison.AsyncChunk{chunk: chunk, id: ^id} ->
        {:ok, {:chunk, chunk}}

      %HTTPoison.AsyncEnd{id: ^id} ->
        :done
    end
  end

  # Buffers the received chunk, emitting to the stream once the buffer reaches
  # @min_stream_chunk_size or once all expected data has arrived.
  defp process_chunk(response_chunk, acc, lib_httpoison) do
    # initialize stream chunk if nil
    updated_stream_chunk =
      if is_nil(acc.stream_chunk) do
        response_chunk
      else
        acc.stream_chunk <> response_chunk
      end

    # update how much data we have received so far
    updated_received_chunks_size = acc.received_chunks_size + byte_size(response_chunk)

    # send signal to continue download
    lib_httpoison.stream_next(acc.ref)

    cond do
      # if we are over the minimum required for us to send chunk to stream,
      # send it to stream
      byte_size(updated_stream_chunk) >= @min_stream_chunk_size ->
        {
          [updated_stream_chunk],
          %{acc | stream_chunk: nil, received_chunks_size: updated_received_chunks_size}
        }

      # if we have received all data, send what's left to the stream and clear
      # the buffer so it is not emitted a second time when the request ends
      updated_received_chunks_size == acc.content_length ->
        {
          [updated_stream_chunk],
          %{acc | stream_chunk: nil, received_chunks_size: updated_received_chunks_size}
        }

      # for everything else, keep building up the chunk
      true ->
        {
          [],
          %{
            acc
            | stream_chunk: updated_stream_chunk,
              received_chunks_size: updated_received_chunks_size
          }
        }
    end
  end

  # Records the response's Content-Length (0 when absent) so process_chunk/3
  # can detect when the last byte has arrived.
  defp process_headers(headers, acc, lib_httpoison) do
    # look through headers to get content length; header names are
    # case-insensitive per the HTTP spec, and hackney does not normalize them
    content_length_from_header =
      Enum.find_value(headers, fn {name, val} ->
        if String.downcase(name) == "content-length", do: String.to_integer(val)
      end)

    # send signal to continue download
    lib_httpoison.stream_next(acc.ref)

    {[], %{acc | content_length: content_length_from_header || 0}}
  end

  # Any other (already vetted) message: just ask for the next one and emit
  # nothing to the stream.
  defp process_status(_msg, acc, lib_httpoison) do
    lib_httpoison.stream_next(acc.ref)

    {[], acc}
  end

  # Emits whatever is still buffered when the request ends; the :done flag
  # halts the stream on the following iteration without another receive.
  defp flush_remaining(%{stream_chunk: nil} = acc), do: {:halt, acc}

  defp flush_remaining(acc) do
    {[acc.stream_chunk], %{acc | stream_chunk: nil, done: true}}
  end
end

ex_cubic_ingestion/lib/ex_cubic_ingestion/workers/fetch_dmap.ex

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -118,12 +118,11 @@ defmodule ExCubicIngestion.Workers.FetchDmap do
118118

119119
prefix_incoming = Application.fetch_env!(:ex_cubic_ingestion, :s3_bucket_prefix_incoming)
120120

121-
resp = lib_httpoison.get!(dataset_url)
122-
123-
bucket_incoming
124-
|> ExAws.S3.put_object(
125-
"#{prefix_incoming}cubic/dmap/#{dataset_rec.type}/#{dataset_rec.identifier}.csv.gz",
126-
resp.body
121+
dataset_url
122+
|> Downloader.stream!(lib_httpoison)
123+
|> ExAws.S3.upload(
124+
bucket_incoming,
125+
"#{prefix_incoming}cubic/dmap/#{dataset_rec.type}/#{dataset_rec.identifier}.csv.gz"
127126
)
128127
|> lib_ex_aws.request!()
129128

0 commit comments

Comments
 (0)