-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathort_inference.cpp
More file actions
165 lines (134 loc) · 6.72 KB
/
Copy pathort_inference.cpp
File metadata and controls
165 lines (134 loc) · 6.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#include <Eigen/Dense>
#include <onnxruntime/core/session/onnxruntime_cxx_api.h>
#include <unsupported/Eigen/CXX11/Tensor>
// this is the nmp model baked into a header file
#include "basicpitch.hpp"
#include "model.ort.h"
using namespace basic_pitch::constants;
static Eigen::Tensor2dXf
unwrap_output(const Eigen::Tensor3dRowMajorXf &tensor_3d,
int audio_original_length, int n_overlapping_frames)
{
int batch_size = tensor_3d.dimension(0); // Number of batches (chunks)
int n_times_short =
tensor_3d.dimension(1); // Number of time steps per chunk
int n_freqs = tensor_3d.dimension(2); // Frequency bins
int n_olap = n_overlapping_frames / 2;
// Remove overlapping frames from both start and end
Eigen::array<int, 3> offsets = {0, n_olap, 0};
Eigen::array<int, 3> extents = {batch_size, n_times_short - 2 * n_olap,
n_freqs};
Eigen::Tensor<float, 3, Eigen::RowMajor> output_sliced =
tensor_3d.slice(offsets, extents);
// Flatten the 3D tensor into a 2D tensor
int total_time_steps = batch_size * (n_times_short - 2 * n_olap);
Eigen::Tensor<float, 2, Eigen::RowMajor> unwrapped_output =
output_sliced.reshape(Eigen::array<int, 2>{total_time_steps, n_freqs});
// Calculate the expected output length
int n_output_frames_original = static_cast<int>(
std::floor(audio_original_length *
(ANNOTATIONS_FPS / static_cast<float>(AUDIO_SAMPLE_RATE))));
n_output_frames_original =
std::min(n_output_frames_original,
static_cast<int>(unwrapped_output.dimension(0)));
// Trim the output tensor to match the original audio length
Eigen::Tensor<float, 2, Eigen::RowMajor> final_output =
unwrapped_output.slice(
Eigen::array<int, 2>{0, 0},
Eigen::array<int, 2>{n_output_frames_original, n_freqs});
// Return the final output as a column-major tensor
return final_output.swap_layout().shuffle(Eigen::array<int, 2>{1, 0});
}
basic_pitch::InferenceResult
basic_pitch::ort_inference(const std::vector<float> &mono_audio)
{
return ort_inference(mono_audio.data(), mono_audio.size());
}
basic_pitch::InferenceResult basic_pitch::ort_inference(const float *mono_audio,
int length)
{
// Initialize ONNX Runtime environment
Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "basic_pitch");
// Set session options (use defaults)
Ort::SessionOptions session_options;
// Create the ONNX Runtime session from the in-memory ORT model
Ort::Session session(env, model_ort_start, model_ort_size, session_options);
// Constants for processing; overlap 30 frames
const int chunk_size = AUDIO_N_SAMPLES;
int n_overlapping_frames = 30;
int overlap_len = n_overlapping_frames * FFT_HOP;
int hop_size = AUDIO_N_SAMPLES - overlap_len;
// Padding the start of the audio (overlap_len / 2 zeros at the start)
std::vector<float> padded_audio(overlap_len / 2, 0.0f);
padded_audio.insert(padded_audio.end(), mono_audio, mono_audio + length);
// Calculate the new length after padding
int padded_length = padded_audio.size();
int num_chunks = (padded_length + hop_size - 1) / hop_size;
Ort::AllocatorWithDefaultOptions allocator;
std::array<int64_t, 3> input_shape = {num_chunks, chunk_size, 1};
// Allocate ONNX tensor up front
Ort::Value input_tensor = Ort::Value::CreateTensor<float>(
allocator, input_shape.data(), input_shape.size());
float *ort_tensor_data = input_tensor.GetTensorMutableData<float>();
for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx)
{
int start_pos = chunk_idx * hop_size;
int actual_chunk_size = std::min(chunk_size, padded_length - start_pos);
std::copy(padded_audio.begin() + start_pos,
padded_audio.begin() + start_pos + actual_chunk_size,
ort_tensor_data + chunk_idx * chunk_size); // ORT tensor
// Zero-pad the last chunk if it's smaller than chunk_size
if (actual_chunk_size < chunk_size)
{
std::fill(ort_tensor_data + chunk_idx * chunk_size +
actual_chunk_size,
ort_tensor_data + (chunk_idx + 1) * chunk_size, 0.0f);
}
}
// Input and output names
const char *input_names[] = {"serving_default_input_2:0"};
const char *output_names[] = {
"StatefulPartitionedCall:1", // note
"StatefulPartitionedCall:2", // onset
"StatefulPartitionedCall:0" // contour
};
// Run the inference
auto output_tensors = session.Run(Ort::RunOptions{nullptr}, input_names,
&input_tensor, 1, output_names, 3);
// Retrieve and process shapes for each output
std::vector<int64_t> note_shape =
output_tensors[0].GetTensorTypeAndShapeInfo().GetShape();
std::vector<int64_t> onset_shape =
output_tensors[1].GetTensorTypeAndShapeInfo().GetShape();
std::vector<int64_t> contour_shape =
output_tensors[2].GetTensorTypeAndShapeInfo().GetShape();
int batch_size = note_shape[0];
int n_times_short_notes =
note_shape[1]; // Number of time steps for notes and onsets
int n_freqs_notes = note_shape[2]; // 88 for notes and onsets
int n_times_short_contours = contour_shape[1];
int n_freqs_contours = contour_shape[2]; // 264 for contours
// Get the original length of the audio (in samples)
int audio_original_length = length;
// Access raw output data
float *note_data = output_tensors[0].GetTensorMutableData<float>();
float *onset_data = output_tensors[1].GetTensorMutableData<float>();
float *contour_data = output_tensors[2].GetTensorMutableData<float>();
// Use Eigen::TensorMap to map the ONNX Runtime row-major data
Eigen::TensorMap<Eigen::Tensor3dRowMajorXf> note_tensor(
note_data, batch_size, n_times_short_notes, n_freqs_notes);
Eigen::TensorMap<Eigen::Tensor3dRowMajorXf> onset_tensor(
onset_data, batch_size, n_times_short_notes, n_freqs_notes);
Eigen::TensorMap<Eigen::Tensor3dRowMajorXf> contour_tensor(
contour_data, batch_size, n_times_short_contours, n_freqs_contours);
// Use unwrap_output to unwrap and convert the row-major 3D tensors to
// col-major 2D tensors
Eigen::Tensor2dXf unwrapped_notes =
unwrap_output(note_tensor, audio_original_length, 30);
Eigen::Tensor2dXf unwrapped_onsets =
unwrap_output(onset_tensor, audio_original_length, 30);
Eigen::Tensor2dXf unwrapped_contours =
unwrap_output(contour_tensor, audio_original_length, 30);
return InferenceResult{unwrapped_notes, unwrapped_onsets,
unwrapped_contours};
}