From 8e68c7bcc2686e73522ada128a02dfe34e398e18 Mon Sep 17 00:00:00 2001 From: Sphericalkat Date: Mon, 29 May 2023 13:34:42 +0530 Subject: [PATCH] fix(markup): only escape innermost markup Signed-off-by: Sphericalkat --- frontend/index.html | 2 +- pkg/converters/markup_converter.go | 23 +- pkg/converters/paragraph_converter.go | 9 +- response.json | 1335 +++++++++++-------------- 4 files changed, 580 insertions(+), 789 deletions(-) diff --git a/frontend/index.html b/frontend/index.html index 8672ceb..2e0717e 100644 --- a/frontend/index.html +++ b/frontend/index.html @@ -13,7 +13,7 @@ RIP Medium \n \n", - "type": "PRE", - "href": null, - "layout": null, - "markups": [], - "iframe": null, - "metadata": null - }, - { - "name": "beef", - "text": "With this implemented on the server, the only remaining task is to write some JavaScript to detect when our Deferred Data chunk arrives. We did this with a MutationObserver, which is an efficient way to observe DOM changes. Once the Deferred Data JSON element is detected, we parse the result and inject it into our application’s network data store. From the application’s perspective, it’s as though a normal network request has been completed.", + "name": "d513", + "text": "Apache Hudi is an open-source data management framework used to simplify incremental data processing and data pipeline development. This framework more efficiently manages business requirements like data lifecycle and improves data quality. Hudi enables you to manage data at the record-level on cloud based data lakes to simplify Change Data Capture (CDC) and streaming data ingestion and helps to handle data privacy use cases requiring record level updates and deletes. Data sets managed by Hudi are stored in a cloud storage bucket using open storage formats, while integrations with Presto, Apache Hive and/or Apache Spark gives near real-time access to updated data using familiar tools.", "type": "P", "href": null, "layout": null, @@ -721,19 +602,37 @@ { "title": "", "type": "A", - "href": "https://developer.mozilla.org/en-US/docs/Web/API/MutationObserver", + "href": "https://hive.apache.org", "userId": null, - "start": 156, - "end": 172, + "start": 596, + "end": 607, "anchorType": "LINK" + }, + { + "title": "", + "type": "A", + "href": "https://spark.apache.org", + "userId": null, + "start": 615, + "end": 627, + "anchorType": "LINK" + }, + { + "title": null, + "type": "STRONG", + "href": null, + "userId": null, + "start": 0, + "end": 11, + "anchorType": null } ], "iframe": null, "metadata": null }, { - "name": "64b4", - "text": "Watch out for `defer`", + "name": "9ebe", + "text": "Apache Spark is an open-source unified analytics engine for large-scale data processing. Spark provides an interface for programming clusters with implicit data parallelism and fault tolerance. Originally developed at the University of California, Berkeley’s AMPLab, the Spark codebase was later donated to the Apache Software Foundation, which has maintained it since.", "type": "P", "href": null, "layout": null, @@ -744,7 +643,7 @@ "href": null, "userId": null, "start": 0, - "end": 21, + "end": 12, "anchorType": null } ], @@ -752,8 +651,8 @@ "metadata": null }, { - "name": "3228", - "text": "You may notice that some tags are re-ordered from the Early Flush example. The script tags moved from the Early chunk to the Body chunk and no longer have the defer attribute. This attribute avoids render-blocking script execution by deferring scripts until after the HTML has been downloaded and parsed. This is suboptimal when using Deferred Data, as all of the visible content has already been received by the end of the Body chunk, and we no longer worry about render-blocking at that point. We can fix this by moving the script tags to the end of the Body chunk, and removing the defer attribute. Moving the tags later in the document does introduce a network waterfall, which we solved by adding preload tags into the Early chunk.", + "name": "1595", + "text": "Now, since we are building a solution on Google Cloud, the best way to go about this would be to use Google Cloud Dataproc. Google Cloud Dataproc is a managed service for processing large datasets, such as those used in big data initiatives. Dataproc is part of Google Cloud Platform, Google’s public cloud offering. Dataproc helps users process, transform and understand vast quantities of data.", "type": "P", "href": null, "layout": null, @@ -761,59 +660,19 @@ { "title": "", "type": "A", - "href": "https://developer.mozilla.org/en-US/docs/Web/HTML/Element/script#attributes", + "href": "https://cloud.google.com/dataproc", "userId": null, - "start": 159, - "end": 174, + "start": 101, + "end": 122, "anchorType": "LINK" }, - { - "title": "", - "type": "A", - "href": "https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/rel/preload", - "userId": null, - "start": 702, - "end": 709, - "anchorType": "LINK" - } - ], - "iframe": null, - "metadata": null - }, - { - "name": "1672", - "text": "Implementation Challenges", - "type": "H3", - "href": null, - "layout": null, - "markups": [], - "iframe": null, - "metadata": null - }, - { - "name": "d693", - "text": "Status codes and headers", - "type": "H3", - "href": null, - "layout": null, - "markups": [], - "iframe": null, - "metadata": null - }, - { - "name": "254a", - "text": "Early Flush prevents subsequent changes to the headers (e.g to redirect or change the status code). In the React + NodeJS world, it’s common to delegate redirects and error throwing to a React app rendered after the data has been fetched. This won’t work if you’ve already sent an early tag and a 200 OK status.", - "type": "P", - "href": null, - "layout": null, - "markups": [ { "title": null, - "type": "CODE", + "type": "STRONG", "href": null, "userId": null, - "start": 287, - "end": 293, + "start": 149, + "end": 240, "anchorType": null } ], @@ -821,59 +680,36 @@ "metadata": null }, { - "name": "6ddd", - "text": "We solved this problem by moving error and redirect logic out of our React app. That logic is now performed in Express server middleware before we attempt to Early Flush.", + "name": "4239", + "text": "Inside the Google Dataproc instance, Spark and all the required libraries are preinstalled. After we have created the instance, we can run the following spark job in it to complete our pipeline:", "type": "P", "href": null, "layout": null, - "markups": [ - { - "title": "", - "type": "A", - "href": "https://expressjs.com/en/guide/using-middleware.html", - "userId": null, - "start": 111, - "end": 136, - "anchorType": "LINK" - } - ], - "iframe": null, - "metadata": null - }, - { - "name": "802d", - "text": "Buffering", - "type": "H3", - "href": null, - "layout": null, "markups": [], "iframe": null, "metadata": null }, { - "name": "f77b", - "text": "We found that nginx buffer responses by default. This has resource utilization benefits but is counterproductive when the goal is sending incremental responses. We had to configure these services to disable buffering. We expected a potential increase in resource usage with this change but found the impact to be negligible.", - "type": "P", + "name": "9d97", + "text": "", + "type": "IFRAME", "href": null, - "layout": null, - "markups": [ - { - "title": "", - "type": "A", - "href": "https://www.nginx.com/resources/wiki/start/topics/examples/x-accel/#x-accel-buffering", - "userId": null, - "start": 14, - "end": 19, - "anchorType": "LINK" + "layout": "INSET_CENTER", + "markups": [], + "iframe": { + "mediaResource": { + "href": "https://gist.github.com/mdhishaamakhtar/d64a299e4beda9a919df0d6a0ae29ec8", + "iframeSrc": "", + "iframeWidth": 0, + "iframeHeight": 0 } - ], - "iframe": null, + }, "metadata": null }, { - "name": "534f", - "text": "Response delays", - "type": "H3", + "name": "3e0c", + "text": "This would run a spark job that fetches the data from the Kafka that we pushed earlier to and writes it to a Google Cloud Storage Bucket. We have to specify the Kafka Topic, the Schema Registry URL and other relevant configurations.", + "type": "P", "href": null, "layout": null, "markups": [], @@ -881,54 +717,7 @@ "metadata": null }, { - "name": "4a34", - "text": "We noticed that our Early Flush responses had an unexpected delay of around 200ms, which disappeared when we disabled gzip compression. This turned out to be an interaction between Nagle’s algorithm and Delayed ACK. These optimizations attempt to maximize data sent per packet, introducing latency when sending small amounts of data. It’s especially easy to run into this issue with jumbo frames, which increases maximum packet sizes. It turns out that gzip reduced the size of our writes to the point where they couldn’t fill a packet, and the solution was to disable Nagle’s algorithm in our haproxy load balancer.", - "type": "P", - "href": null, - "layout": null, - "markups": [ - { - "title": "", - "type": "A", - "href": "https://en.wikipedia.org/wiki/Nagle%27s_algorithm", - "userId": null, - "start": 181, - "end": 198, - "anchorType": "LINK" - }, - { - "title": "", - "type": "A", - "href": "https://en.wikipedia.org/wiki/TCP_delayed_acknowledgment", - "userId": null, - "start": 203, - "end": 214, - "anchorType": "LINK" - }, - { - "title": "", - "type": "A", - "href": "https://en.wikipedia.org/wiki/Jumbo_frame", - "userId": null, - "start": 383, - "end": 395, - "anchorType": "LINK" - }, - { - "title": "", - "type": "A", - "href": "https://www.haproxy.com/documentation/hapee/latest/onepage/#4.2-option%20http-no-delay", - "userId": null, - "start": 594, - "end": 601, - "anchorType": "LINK" - } - ], - "iframe": null, - "metadata": null - }, - { - "name": "72d0", + "name": "6bfa", "text": "Conclusion", "type": "H3", "href": null, @@ -938,8 +727,8 @@ "metadata": null }, { - "name": "afbd", - "text": "HTTP Streaming has been a very successful strategy for improving web performance at Airbnb. Our experiments showed that Early Flush produced a flat reduction in First Contentful Paint (FCP) of around 100ms on every page tested, including the Airbnb homepage. Data streaming further eliminated the FCP costs of slow backend queries. While there were challenges along the way, we found that adapting our existing React application to support streaming was very feasible and robust, despite not being designed for it originally. We’re also excited to see the broader frontend ecosystem trend in the direction of prioritizing streaming, from @defer and @stream in GraphQL to streaming SSR in Next.js. Whether you’re using these new technologies, or extending an existing codebase, we hope you’ll explore streaming to build a faster frontend for all!", + "name": "647d", + "text": "There are several ways in which a data lake can be architected. I have tried to show how to build a data lake using Debezium, Kafka, Hudi, Spark and Google Cloud. Using a setup like this, one can easily scale the pipeline to manage huge data workloads! For more details into each technology, the documentation can be visited. The Spark Job can be customized to have much more fine-grained control. The Hudi shown here can also be integrated with Presto, Hive or Trino. The number of customizations are endless. This article provides one with a basic intro on how one can build a basic data pipeline using the above tools!", "type": "P", "href": null, "layout": null, @@ -947,28 +736,64 @@ { "title": "", "type": "A", - "href": "https://web.dev/fcp/", + "href": "https://debezium.io", "userId": null, - "start": 161, - "end": 183, + "start": 116, + "end": 124, "anchorType": "LINK" }, { "title": "", "type": "A", - "href": "https://graphql.org/blog/2020-12-08-improving-latency-with-defer-and-stream-directives/", + "href": "https://kafka.apache.org", "userId": null, - "start": 638, - "end": 667, + "start": 126, + "end": 131, "anchorType": "LINK" }, { "title": "", "type": "A", - "href": "https://nextjs.org/docs/advanced-features/react-18/streaming", + "href": "https://hudi.apache.org", "userId": null, - "start": 671, - "end": 695, + "start": 133, + "end": 137, + "anchorType": "LINK" + }, + { + "title": "", + "type": "A", + "href": "https://spark.apache.org", + "userId": null, + "start": 139, + "end": 144, + "anchorType": "LINK" + }, + { + "title": "", + "type": "A", + "href": "https://prestodb.io", + "userId": null, + "start": 446, + "end": 452, + "anchorType": "LINK" + }, + { + "title": "", + "type": "A", + "href": "https://hive.apache.org", + "userId": null, + "start": 454, + "end": 458, + "anchorType": "LINK" + }, + { + "title": "", + "type": "A", + "href": "https://trino.io", + "userId": null, + "start": 462, + "end": 467, "anchorType": "LINK" } ], @@ -976,8 +801,18 @@ "metadata": null }, { - "name": "ecb2", - "text": "If this type of work interests you, check out some of our related positions here.", + "name": "816f", + "text": "If you’ve enjoyed this story, please click the 👏 button and share it, so that others can find it as well! Also, feel free to leave a comment below.", + "type": "P", + "href": null, + "layout": null, + "markups": [], + "iframe": null, + "metadata": null + }, + { + "name": "1ef7", + "text": "Groww Engineering publishes technical anecdotes, the latest technologies, and better ways to tackle common programming problems. You can subscribe here to get the latest updates.", "type": "P", "href": null, "layout": null, @@ -985,65 +820,15 @@ { "title": "", "type": "A", - "href": "https://careers.airbnb.com/", + "href": "https://medium.com/groww-engineering", "userId": null, - "start": 76, - "end": 80, + "start": 137, + "end": 151, "anchorType": "LINK" } ], "iframe": null, "metadata": null - }, - { - "name": "55ba", - "text": "Acknowledgments", - "type": "H3", - "href": null, - "layout": null, - "markups": [], - "iframe": null, - "metadata": null - }, - { - "name": "2e78", - "text": "Elliott Sprehn, Aditya Punjani, Jason Jian, Changgeng Li, Siyuan Zhou, Bruce Paul, Max Sadrieh, and everyone else who helped design and implement streaming at Airbnb!", - "type": "P", - "href": null, - "layout": null, - "markups": [], - "iframe": null, - "metadata": null - }, - { - "name": "1e8d", - "text": "****************", - "type": "H3", - "href": null, - "layout": null, - "markups": [], - "iframe": null, - "metadata": null - }, - { - "name": "1416", - "text": "All product names, logos, and brands are property of their respective owners. All company, product and service names used in this website are for identification purposes only. Use of these names, logos, and brands does not imply endorsement.", - "type": "P", - "href": null, - "layout": null, - "markups": [ - { - "title": null, - "type": "EM", - "href": null, - "userId": null, - "start": 0, - "end": 241, - "anchorType": null - } - ], - "iframe": null, - "metadata": null } ] }